In [11]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
import os

import argilla as rg
from argilla.client.feedback.dataset.local import FeedbackDataset
from datasets import load_dataset

In [2]:
# Connection settings are read from the environment so real credentials never
# have to be written into the notebook; the fallbacks are the local
# development values this notebook was written against.
rg.init(
    api_url=os.environ.get("ARGILLA_API_URL", "http://localhost:6900"),
    api_key=os.environ.get("ARGILLA_API_KEY", "owner.apikey"),
)

This may lead to potential compatibility issues during your experience.
To ensure a seamless and optimized connection, we highly recommend aligning your client version with the server version.


In [3]:
class FeedbackDs(FeedbackDataset):
    """
    FeedbackDataset with task-specific factory methods and a column-wise ``add_item`` helper.

    Every ``for_*`` classmethod builds a dataset with the fields and questions of
    one task and returns it as an instance of this class.

    Task templates: question answering, text classification, summarization,
    translation, supervised fine-tuning, conversational, retrieval-augmented
    generation and sentence similarity. Token classification has no template yet.
    """

    def __init__(self) -> None:
        """
        Create an instance without running the base class initializer.

        The instance starts with no attributes of its own; the ``for_*`` factory
        methods populate it by copying the state of a configured dataset.
        """
        pass

    @classmethod
    def _from_dataset(cls, ds):
        """
        Wrap an already configured dataset in an instance of this class.

        :param ds: The dataset whose instance attributes are copied.
        :return: A new instance of ``cls`` carrying the same instance attributes as ``ds``.
        """
        custom_ds = cls()
        # Attribute values are shared with ``ds``, not deep-copied.
        for attr, value in vars(ds).items():
            setattr(custom_ds, attr, value)
        return custom_ds

    def add_item(self, items):
        """
        Add one record per row of a column-oriented dictionary.

        Make sure that the keys of the dictionary are the same as the names of the fields.

        :param items: A dictionary mapping field names to equally long sequences of
            values, e.g. ``{"context": ["c1", "c2"], "question": ["q1", "q2"]}``.
        :return: None
        :raises ValueError: If ``items`` has a key that is not a dataset field, lacks
            a required dataset field, or its columns differ in length.
        """
        dataset_fields = [field.name for field in self.fields]

        if not set(items).issubset(dataset_fields):
            raise ValueError("Item fields are not subset of dataset fields")

        # A field is treated as required unless it explicitly says otherwise, so a
        # missing column is reported here instead of surfacing as a bare KeyError.
        missing = [
            field.name
            for field in self.fields
            if field.name not in items and getattr(field, "required", True)
        ]
        if missing:
            raise ValueError(f"Item is missing values for dataset fields: {missing}")

        # Every column must describe the same number of records; otherwise rows
        # would be truncated or indexing would run past the shorter column.
        lengths = {len(values) for values in items.values()}
        if len(lengths) > 1:
            raise ValueError("All item fields must contain the same number of values")
        if not lengths:
            return
        n_records = lengths.pop()

        # Keep the dataset's field order and skip fields that were not supplied.
        provided = [name for name in dataset_fields if name in items]
        records = [
            rg.FeedbackRecord(fields={name: items[name][row] for name in provided})
            for row in range(n_records)
        ]
        # Nothing is sent when the columns are empty.
        if records:
            self.add_records(records)

    @classmethod
    def for_question_answering(cls, answers=False):
        """
        You can use this method to create a basic dataset for question answering tasks.
        To add items to your dataset, use the "add_item" method.

        :param answers: Set this parameter to True if you want to add an "answer" field to your dataset
        :return: A dataset for question answering containing "context", "question" and
            optionally "answer" fields, plus an "answer" text question
        """
        ds = rg.FeedbackDataset(
            fields=[
                rg.TextField(name="context"),
                rg.TextField(name="question"),
            ],
            questions=[
                rg.TextQuestion(name="answer"),
            ],
        )
        if answers:
            ds.fields.append(rg.TextField(name="answer"))

        return cls._from_dataset(ds)

    @classmethod
    def for_text_classification(cls, labels, multi_label=False):
        """
        You can use this method to create a basic dataset for text classification tasks.

        :param labels: A list of labels for the classification task.
        :param multi_label: Set this parameter to True if you want to create a multi-label classification dataset.
        :return: A dataset for text classification containing a "text" field and a "label" question.
        """
        # Only the question type differs between the single- and multi-label variants.
        question_cls = rg.MultiLabelQuestion if multi_label else rg.LabelQuestion
        ds = rg.FeedbackDataset(
            fields=[
                rg.TextField(name="text"),
            ],
            questions=[
                question_cls(name="label", labels=labels),
            ],
        )
        return cls._from_dataset(ds)

    @classmethod
    def for_summarization(cls, labels=False, examples=False):
        """
        You can use this method to create a basic dataset for summarization tasks.

        :param labels: A list of labels if you want an additional "label" question to have
            the summaries annotated; leave as False to omit that question.
        :param examples: Set this parameter to True if you want to add an "example" text field.
        :return: A dataset for summarization containing a "text" field and a "summary" question.
        """
        ds = rg.FeedbackDataset(
            fields=[
                rg.TextField(name="text"),
            ],
            questions=[
                rg.TextQuestion(name="summary"),
            ],
        )
        if labels:
            # ``labels`` is forwarded as the label list, so it must be a list, not True.
            ds.questions.append(rg.LabelQuestion(name="label", labels=labels))
        if examples:
            ds.fields.append(rg.TextField(name="example"))

        return cls._from_dataset(ds)

    @classmethod
    def for_translation(cls, labels=False, examples=False):
        """
        You can use this method to create a basic dataset for translation tasks.

        :param labels: A list of labels if you want an additional "label" question to have
            the translations annotated; leave as False to omit that question.
        :param examples: Set this parameter to True if you want to add an "example" text field.
        :return: A dataset for translation containing a "text" field and a "translation" question.
        """
        questions = [rg.TextQuestion(name="translation")]
        if labels:
            # ``labels`` is forwarded as the label list, so it must be a list, not True.
            questions.append(rg.LabelQuestion(name="label", labels=labels))

        ds = FeedbackDataset(
            fields=[
                rg.TextField(name="text"),
            ],
            questions=questions,
        )
        if examples:
            ds.fields.append(rg.TextField(name="example", title="Example Translation"))

        return cls._from_dataset(ds)

    @classmethod
    def for_supervised_fine_tuning(cls, field_names: list):
        """
        You can use this method to create a basic dataset for supervised fine-tuning tasks.

        :param field_names: The names of the text fields to create. A list or tuple yields one
            field per entry; any other value is treated as a single field name.
        :return: A dataset with one text field per name and an "answer" text question.
        """
        if isinstance(field_names, (list, tuple)):
            field_names = list(field_names)
        else:
            field_names = [field_names]

        ds = rg.FeedbackDataset(
            fields=[rg.TextField(name=field) for field in field_names],
            questions=[
                rg.TextQuestion(name="answer"),
            ],
        )
        return cls._from_dataset(ds)

    @classmethod
    def for_conversational(cls, system_prompt=False):
        """
        You can use this method to create a basic dataset for conversational tasks.

        :param system_prompt: Set this parameter to True if you want a "system_prompt" field
            placed before the other fields.
        :return: A dataset containing "user_prompt" and "response" fields and an "answer"
            text question.
        """
        ds = rg.FeedbackDataset(
            fields=[
                rg.TextField(name="user_prompt"),
                rg.TextField(name="response"),
            ],
            questions=[
                rg.TextQuestion(name="answer"),
            ],
        )
        if system_prompt:
            ds.fields.insert(0, rg.TextField(name="system_prompt"))

        return cls._from_dataset(ds)

    @classmethod
    def for_rag(cls, retrieval_source=False):
        """
        You can use this method to create a basic dataset for retrieval-augmented generation tasks.

        :param retrieval_source: Set this parameter to True if you want to add a
            "retrieval_source" field.
        :return: A dataset containing "query" and "retrieved_document" fields and an "answer"
            text question.
        """
        ds = rg.FeedbackDataset(
            fields=[
                rg.TextField(name="query"),
                rg.TextField(name="retrieved_document", title="Retrieved Document"),
            ],
            questions=[
                rg.TextQuestion(name="answer"),
            ],
        )
        if retrieval_source:
            ds.fields.append(
                rg.TextField(name="retrieval_source", title="Retrieval Source")
            )

        return cls._from_dataset(ds)

    @classmethod
    def for_sentence_similarity(cls):
        """
        You can use this method to create a basic dataset for sentence similarity tasks.

        :return: A dataset containing "premise" and "hypothesis" fields and a "relationship"
            label question with the labels "entailment", "neutral" and "contradiction".
        """
        ds = rg.FeedbackDataset(
            fields=[
                rg.TextField(name="premise"),
                rg.TextField(name="hypothesis"),
            ],
            questions=[
                rg.LabelQuestion(
                    name="relationship",
                    labels=["entailment", "neutral", "contradiction"],
                ),
            ],
        )
        return cls._from_dataset(ds)


SENTENCE SIMILARITY

In [43]:
dataset_sent_sim = load_dataset("plaguss/snli-small", split="train")
# Read each column once instead of re-evaluating the column lookup for every row.
dict_sent_sim = {
    "premise": list(dataset_sent_sim["premise"]),
    "hypothesis": list(dataset_sent_sim["hypothesis"]),
}
ds_sent_sim = FeedbackDs.for_sentence_similarity()
ds_sent_sim.add_item(dict_sent_sim)

RAG

In [41]:
# One-record example that also fills the optional "retrieval_source" field.
dict_rag = dict(
    query=["What is the status of the war?"],
    retrieved_document=["The war is over."],
    retrieval_source=["wikipedia"],
)
ds_rag = FeedbackDs.for_rag(retrieval_source=True)
ds_rag.add_item(items=dict_rag)

CONVERSATIONAL

In [38]:
conversational_hf = load_dataset("georgesung/OpenOrca_35k", split="train").shard(index=1, num_shards=1110)
# Read each column once instead of re-evaluating the column lookup for every row.
# The two dictionaries get their own list objects, so they can be modified independently.
dict_conversational_wo_systemprompt = {
    "user_prompt": list(conversational_hf["question"]),
    "response": list(conversational_hf["response"]),
}
dict_conversational_w_systemprompt = {
    "user_prompt": list(conversational_hf["question"]),
    "response": list(conversational_hf["response"]),
    "system_prompt": list(conversational_hf["system_prompt"]),
}

In [39]:
# Build the conversational template with the extra "system_prompt" field and
# fill it from the dictionary that carries a "system_prompt" column.
ds_conversational = FeedbackDs.for_conversational(system_prompt=True)
ds_conversational.add_item(dict_conversational_w_systemprompt)

SUPERVISED FINETUNING

In [36]:
# Three-field supervised fine-tuning template, filled with a single placeholder record.
ds_supervised = FeedbackDs.for_supervised_fine_tuning(
    field_names=["instruction", "context", "response"]
)
ds_supervised.add_item(
    items=dict(
        instruction=["Hello_instruction"],
        context=["Hello_context"],
        response=["Hello_response"],
    )
)

QA 

In [25]:
# Single question/context pair for the default question answering template.
data_items = dict(question=["qq1"], context=["cc1"])
ds_qa = FeedbackDs.for_question_answering()
ds_qa.add_item(items=data_items)

SUMMARIZATION

In [30]:
sum_ds = load_dataset("ccdv/govreport-summarization", split="train").shard(index=0, num_shards=1000)
# Read each column once instead of re-evaluating the column lookup for every row.
# The two dictionaries get their own list objects, so they can be modified independently.
dict_summarization_wo_summary = {
    "text": list(sum_ds["report"]),
}
dict_summarization_w_summary = {
    "text": list(sum_ds["report"]),
    "example": list(sum_ds["summary"]),
}

In [31]:
# Summarization template with an extra "label" question and no "example" field,
# so it is filled from the dictionary that only has a "text" column.
labels_sum = ["pos", "neg"]
ds_summarization = FeedbackDs.for_summarization(examples=False, labels=labels_sum)
ds_summarization.add_item(items=dict_summarization_wo_summary)

TEXTCAT

In [27]:
# Single-label text classification over four hand-written texts.
labels = ["pos", "neg"]
textcat_items = dict(
    text=[
        "I love this movie",
        "I hate this movie",
        "I love it",
        "erwer",
    ]
)
ds_textcat = FeedbackDs.for_text_classification(labels=labels)
ds_textcat.add_item(items=textcat_items)

TEXTCAT2

In [21]:
ds_banking = load_dataset("banking77", split="train")
# Read the column once instead of re-evaluating the column lookup for every row.
dict_banking = {"text": list(ds_banking["text"])}



In [28]:
# Multi-label text classification template filled with the banking77 texts.
labels = ["pos", "neg", "neu"]
ds_textcat2 = FeedbackDs.for_text_classification(labels=labels, multi_label=True)
ds_textcat2.add_item(items=dict_banking)

TRANSLATION

In [3]:
translation_hf = load_dataset("abidlabs/test-translation-dataset", split="train")
# Read the column once instead of re-evaluating the column lookup for every row.
dict_translation = {
    "text": list(translation_hf["Input"]),
    # Optional reference translations, currently left out:
    # "example": list(translation_hf["Translation"]),
}

In [4]:
# NOTE(review): this calls for_translation/add_item on FeedbackDataset itself rather
# than on FeedbackDs as the other task cells do — confirm the installed argilla
# build provides both methods.
ds_translation = FeedbackDataset.for_translation(labels=False)
ds_translation.add_item(dict_translation)

UPLOAD

In [5]:
# Upload the translation dataset to the server as "try37" in the "admin" workspace.
remote_ds = ds_translation.push_to_argilla(name="try37", workspace="admin")

Pushing records to Argilla...: 100%|██████████| 2/2 [00:00<00:00, 28.21it/s]


In [3]:
# Create the library's question answering template and set use_markdown on its first field.
ds0 = rg.FeedbackDataset.for_question_answering()
ds0.fields[0].use_markdown = True

In [7]:
# Display the repr of the library's translation template (its fields and questions).
rg.FeedbackDataset.for_translation()

<FeedbackDataset fields=[TextField(name='text', title='Text', required=True, type='text', use_markdown=False)] questions=[TextQuestion(name='translation', title='Translation', description=None, required=True, type='text', use_markdown=False)] guidelines=None>

In [4]:
# Add a single record with "question" and "context" values to ds0.
ds0.add_records(rg.FeedbackRecord(fields={"question": "qq1", "context": "cc1"}))

In [13]:
# NOTE(review): rebinds ds0, which earlier cells use for the question answering dataset.
ds0 = rg.FeedbackDataset.for_sentence_similarity()

for_sentence_similarity
