In [None]:
!pip install datasets .

In [None]:
import argilla.v1 as rg_v1

In [None]:
import os

# Connection details of the legacy (v1) Argilla server to migrate from.
# Both values can be overridden through environment variables so that a real
# API key never has to be written into the notebook; the fallbacks are the
# original demo values.
url = os.environ.get("ARGILLA_V1_API_URL", "https://demo.argilla.io")
api_key = os.environ.get("ARGILLA_V1_API_KEY", "<your-api-key>")

In [None]:
# Open the connection to the legacy (v1) server with the URL and API key set in the previous cell.
rg_v1.init(api_url=url, api_key=api_key)

In [None]:
# Name and workspace of the v1 dataset that will be migrated.
dataset_name, workspace = "news-programmatic-labeling", "demo"

Read the current dataset labels

In [None]:
# Fetch the settings of the v1 dataset (they carry the label schema used further down) and display them.
settings_v1 = rg_v1.load_dataset_settings(name=dataset_name, workspace=workspace)
settings_v1

Read the dataset records


In [None]:
# Load at most 100 records matching the `_exists_:annotated_by` query from the
# v1 dataset and convert them with `to_datasets()`.
hf_records = (
    rg_v1.load(
        name=dataset_name,
        workspace=workspace,
        query="_exists_:annotated_by",
        limit=100,
    )
    .to_datasets()
)
hf_records

In [None]:
# Display the first loaded record to inspect the keys that the mapping functions below read.
hf_records[0]

In [None]:
import argilla as rg

# Client for the server where the new dataset is created; this can be a different Argilla server instance than the v1 one.
# NOTE(review): no connection arguments are passed here — presumably they are resolved from the environment; confirm for your setup.
client = rg.Argilla()

In [None]:
# Settings of the new dataset, mirroring the structure of the v1 text classification dataset.
settings = rg.Settings(
    # One field per key of the v1 `record.inputs`; `text` is the default name
    # for text classification.
    fields=[rg.TextField(name="text")],
    # Single-label classification maps to a LabelQuestion (a MultiLabelQuestion
    # would be used for multi-label), reusing the labels of the v1 settings.
    questions=[rg.LabelQuestion(labels=settings_v1.label_schema, name="label")],
    # Every relevant metadata field must be declared here.
    metadata=[rg.TermsMetadataProperty(name="split")],
    # Vector fields available in the dataset.
    vectors=[rg.VectorField(dimensions=384, name="mini-lm-sentence-transformers")],
)
settings

In [None]:
# Start from a clean state: remove any dataset left over from a previous run.
# Depending on the SDK version, the lookup may return `None` for an unknown
# name, so guard against that before calling any method on the result.
ds = client.datasets(name=dataset_name)
if ds is not None and ds.exists():
    ds.delete()

# Create the new dataset from `settings`, explicitly bound to the same client
# that was used for the lookup.
dataset = rg.Dataset(name=dataset_name, settings=settings, client=client)
dataset.create()


In [None]:
# Responses are assigned to users, so the existing users are loaded and indexed by username.
# The authenticated user is kept as well; the mapping functions use it when an
# annotation agent has no matching username.
current_user = client.me
users_by_name = {account.username: account for account in client.users}


Now we can upload the records to the new dataset. To do that, we map each record of the loaded Hugging Face dataset to an `rg.Record`, which can then be logged to the new dataset.

In [None]:
def map_to_record_for_single_label(
    data: dict,
    users_by_name: dict,
    current_user: rg.User,
    question_name: str = "label",
) -> rg.Record:
    """Map a v1 single-label text classification record to a new Argilla record.

    Args:
        data: The v1 record as a dictionary. The keys read are `id`, `inputs`
            and `metadata` (required) plus `prediction`, `prediction_agent`,
            `annotation`, `annotation_agent` and `vectors` (optional).
        users_by_name: Users indexed by username, used to resolve the
            annotation agent into a user id.
        current_user: User that receives the response when the annotation
            agent is missing or is not a key of `users_by_name`.
        question_name: Name of the question that both the suggestion and the
            response refer to.

    Returns:
        The record with its fields, metadata, vectors, suggestions and responses.

    Raises:
        KeyError: If `id`, `inputs` or `metadata` is missing from `data`, or if
            the first prediction lacks the `label` or `score` key.
    """
    suggestions = []
    responses = []
    vectors = []

    if data.get("prediction"):
        # Only the first prediction becomes the suggestion. Its values are read
        # by key so the result does not depend on the key order of the entry.
        top_prediction = data["prediction"][0]
        suggestions.append(
            rg.Suggestion(
                question_name=question_name,
                value=top_prediction["label"],
                score=top_prediction["score"],
                agent=data.get("prediction_agent"),
            )
        )

    if data.get("annotation"):
        # A missing or unknown annotation agent falls back to the current user.
        user_id = users_by_name.get(data.get("annotation_agent"), current_user).id
        responses.append(rg.Response(question_name=question_name, value=data["annotation"], user_id=user_id))

    if data.get("vectors"):
        vectors = [rg.Vector(name=name, values=value) for name, value in data["vectors"].items()]

    return rg.Record(
        id=data["id"],
        # `inputs` should be a dictionary with the same keys as the `fields` in the settings.
        fields=data["inputs"],
        # `metadata` should be a dictionary with the same keys as the `metadata` in the settings.
        metadata=data["metadata"],
        vectors=vectors,
        suggestions=suggestions,
        responses=responses,
    )

In [None]:
def map_to_record_for_multi_label(
    data: dict,
    users_by_name: dict,
    current_user: rg.User,
    question_name: str = "labels",
) -> rg.Record:
    """Map a v1 multi-label text classification record to a new Argilla record.

    Args:
        data: The v1 record as a dictionary. The keys read are `id`, `inputs`
            and `metadata` (required) plus `prediction`, `prediction_agent`,
            `annotation`, `annotation_agent` and `vectors` (optional).
        users_by_name: Users indexed by username, used to resolve the
            annotation agent into a user id.
        current_user: User that receives the response when the annotation
            agent is missing or is not a key of `users_by_name`.
        question_name: Name of the question that both the suggestion and the
            response refer to. Using a single name for both keeps them
            attached to the same question.

    Returns:
        The record with its fields, metadata, vectors, suggestions and responses.

    Raises:
        KeyError: If `id`, `inputs` or `metadata` is missing from `data`, or if
            a prediction entry lacks the `label` or `score` key.
    """
    suggestions = []
    responses = []
    vectors = []

    if data.get("prediction"):
        # Every prediction entry contributes one label and its score.
        labels = [prediction["label"] for prediction in data["prediction"]]
        scores = [prediction["score"] for prediction in data["prediction"]]
        suggestions.append(
            rg.Suggestion(
                question_name=question_name,
                value=labels,
                score=scores,
                agent=data.get("prediction_agent"),
            )
        )

    if data.get("annotation"):
        # A missing or unknown annotation agent falls back to the current user.
        user_id = users_by_name.get(data.get("annotation_agent"), current_user).id
        responses.append(rg.Response(question_name=question_name, value=data["annotation"], user_id=user_id))

    if data.get("vectors"):
        # One rg.Vector per (name, values) pair of the v1 record.
        vectors = [rg.Vector(name=name, values=value) for name, value in data["vectors"].items()]

    return rg.Record(
        id=data["id"],
        # `inputs` should be a dictionary with the same keys as the `fields` in the settings.
        fields=data["inputs"],
        # `metadata` should be a dictionary with the same keys as the `metadata` in the settings.
        metadata=data["metadata"],
        # The vector names should match the `vectors` declared in the settings.
        vectors=vectors,
        suggestions=suggestions,
        responses=responses,
    )


In [None]:
# Map every loaded v1 record to a record of the new dataset.
records = [
    map_to_record_for_single_label(row, users_by_name, current_user)
    for row in hf_records
]

# Upload the mapped records to the new dataset.
dataset.records.log(records)

## Notes about the migration workflow

1. We only need a subset of functions from v1 SDK
    - `rg.init`
    - `rg.load_dataset_settings`
    - `rg.load`
   But, even so, it would be nice to provide the whole SDK, so users can iteratively migrate their code to the v2 version.
2. Users should have a clear notion of the dataset structure (extra inputs, vectors, or metadata).
3. Created responses have a Draft status by default. There is no way to preset the status of a response, which can be blocking, even more so if bulk operations in the UI cannot be applied to the Draft queue.
4. When creating an `rg.Record`, we need to prepare all of its suggestions and responses before creating the record itself. It would help if we could add responses or suggestions to an existing record (`record.suggestions.add(...)`, `record.responses.add(...)`).