In [3]:
from datasets import load_dataset

In [4]:
# Load the `train` split of the Magicoder OSS-Instruct dataset.
ds = load_dataset(
    "ise-uiuc/Magicoder-OSS-Instruct-75K",
    split="train",
)

Downloading readme:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/203M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [20]:
# 30-row sample (fixed shuffle seed) keeping only the language, seed snippet and problem columns.
df = (
    ds.shuffle(seed=422)
    .select(range(30))
    .select_columns(["lang", "seed", "problem"])
    .to_pandas()
)

In [43]:
# Draw another 30-row sample (shuffle seed 1), this time keeping the `solution` column as well.
df = (
    ds.shuffle(seed=1)
    .select(range(30))
    .select_columns(["lang", "seed", "problem", "solution"])
    .to_pandas()
)
idx = 0
# Print the seed snippet, the problem and the solution of row `idx`,
# with a rule between the three parts.
print(
    df["seed"][idx].strip(),
    "__________",
    df["problem"][idx],
    "__________",
    df["solution"][idx],
    sep="\n",
)


//-------------------------------------------------
// Implementation of class BRequest_RemoteArrayTypes23_sendArraysObject
// Generated from class byps.gen.cpp.GenApiClass
__________
You are tasked with implementing a class that manages remote array types in a distributed system. The class `BRequest_RemoteArrayTypes23_sendArraysObject` is generated from a template using a code generation tool. Your task is to create a method within this class that efficiently sends arrays of objects to a remote server. 

Your method should take an array of objects as input and send it to the remote server using the appropriate communication protocol. You should also handle any potential errors that may occur during the communication process.

Your implementation should be efficient and robust, ensuring that the arrays of objects are transmitted accurately and reliably to the remote server.

Write a method `sendArraysObject` within the `BRequest_RemoteArrayTypes23_sendArraysObject` class that fulfills 

In [52]:
from distilabel.pipeline import Pipeline
from distilabel.llm import OpenAILLM
from distilabel.tasks import TextGenerationTask
from dataclasses import dataclass
from distilabel.tasks import Prompt
from typing import List, Dict
import os

In [51]:
# User-prompt template for the OSS-Instruct task. It has a single `{code}` placeholder,
# filled via `str.format`, and asks for the answer in two sections marked
# `[Problem Description]` and `[Solution]`.
oss_instruct_prompt = """Please gain inspiration from the following random code snippet to create a high-quality programming problem. Present your output in two distinct sections:
[Problem Description] and [Solution].

Code snippet for inspiration:
```
{code}
```

Guidelines for each section:
1. [Problem Description]: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included.

2. [Solution]: Offer a comprehensive, **correct** solution that accurately addresses the [Problem Description] you provided."""

@dataclass
class OSSInstruct(TextGenerationTask):
    """Text-generation task that turns a code snippet into a programming problem and solution.

    The user prompt is built from the module-level `oss_instruct_prompt` template, and the
    raw completion is split into a `problem` and a `solution` on the `[Solution]` marker.
    """

    system_prompt: str = "You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions."

    def generate_prompt(self, input: str) -> Prompt:
        """Build the prompt for one seed snippet.

        Args:
            input: Code snippet substituted for `{code}` in `oss_instruct_prompt`.

        Returns:
            A `Prompt` holding the system prompt and the formatted user prompt.
        """
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=oss_instruct_prompt.format(code=input),
        )

    def parse_output(self, output: str) -> Dict[str, str]:
        """Split a raw completion into its problem and solution parts.

        Args:
            output: Raw text returned by the model.

        Returns:
            A dict with keys `"problem"` (text before the marker, with the
            `[Problem Description]` header removed) and `"solution"` (text after it),
            both stripped of surrounding whitespace.

        Raises:
            ValueError: If `output` contains no `[Solution]` marker.
        """
        # Split on the FIRST marker only: a solution that itself mentions "[Solution]"
        # must not break the unpacking, as an unbounded `split` would.
        problem, marker, solution = output.partition("[Solution]")
        if not marker:
            raise ValueError("Model output does not contain a '[Solution]' section.")
        return {
            "problem": problem.replace("[Problem Description]", "").strip(),
            "solution": solution.strip(),
        }

In [71]:
def _make_oss_instruct_pipeline(model: str) -> Pipeline:
    """Return a pipeline whose generator runs the `OSSInstruct` task on the given OpenAI model."""
    llm = OpenAILLM(
        model=model,
        task=OSSInstruct(),
        api_key=os.getenv("OPENAI_API_KEY"),
        max_new_tokens=1024,
        num_threads=4,
        temperature=1,
    )
    return Pipeline(generator=llm)


# Both pipelines share every setting; only the model name differs.
pipe_generation = _make_oss_instruct_pipeline("gpt-3.5-turbo")
pipe_generation_og = _make_oss_instruct_pipeline("gpt-3.5-turbo-1106")

In [58]:
# First 30 rows of the dataset after a seeded shuffle.
subset = (
    ds.shuffle(seed=422)
    .select(range(30))
)

In [68]:
# Keep only the seed snippet and expose it under the column name `input`.
distilabel_subset = (
    subset
    .select_columns(["seed"])
    .rename_column("seed", "input")
)

In [70]:
# Display the first record of the generation input.
distilabel_subset[0]

{'input': 'def preprocess(data, lang):\n    data["arg_negate"] = parse_template_boolean_value(data, parameter="arg_negate", default_value=False)\n    data["arg_is_regex"] = parse_template_boolean_value(data, parameter="arg_is_regex", default_value=False)\n    return data\n'}

In [72]:
# Run `pipe_generation` over the seed snippets: one generation per row, in batches of 8.
# No checkpoint strategy is passed (previously sketched as
# `checkpoint_strategy=dataset_checkpoint_sft`, left disabled).
oss_instruct_ds = pipe_generation.generate(dataset=distilabel_subset, num_generations=1, batch_size=8)

Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

Output()

Flattening the indices:   0%|          | 0/30 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/30 [00:00<?, ? examples/s]

In [103]:
# Preview the first rows of the generated dataset as a DataFrame.
oss_instruct_ds.to_pandas().head()

Unnamed: 0,input,generation_model,generation_prompt,raw_generation_responses,problem,solution,generations
0,"def preprocess(data, lang):\n data[""arg_neg...",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[[Problem Description]\nYou are given a code s...,[You are given a code snippet that defines a f...,"[```python\ndef preprocess(data, lang):\n ""...",
1,bash start-datafari.sh\n,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[[Problem Description]\n\nYou need to write a ...,[You need to write a Bash script that starts t...,[```bash\n#!/bin/bash\n\n# Check if the Datafa...,
2,"sa.Column('is_deleted', sa.Boolean(), null...",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[[Problem Description]\nYou have been given a ...,[You have been given a snippet of code that de...,[```\ndef analyze_table_structure(code_snippet...,
3,import com.google.common.collect.Lists;\nimpor...,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[[Problem Description]\n\nYou are working on a...,[You are working on a project that involves in...,[```java\nimport com.google.common.collect.Lis...,
4,//\n// This is an auto-generated file.\n//\n\n...,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[[Problem Description]\nYou are given a code s...,[You are given a code snippet that defines a s...,[A straightforward solution to this problem is...,


In [76]:
# Run `pipe_generation_og` over the same seed snippets: one generation per row, in batches of 8.
# No checkpoint strategy is passed (previously sketched as
# `checkpoint_strategy=dataset_checkpoint_sft`, left disabled).
oss_instruct_og = pipe_generation_og.generate(dataset=distilabel_subset, num_generations=1, batch_size=8)

Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/30 [00:00<?, ? examples/s]

In [81]:
#rg_dataset_0 = oss_instruct_ds.to_argilla()
#rg_dataset_og = oss_instruct_og.to_argilla()
#rg_dataset.push_to_argilla(name=name, workspace=workspace)

In [78]:
from huggingface_hub import duplicate_space

import uuid

In [82]:
# Space to copy: the default Argilla template.
from_id = "argilla/argilla-template-space"
# Short random suffix: the first dash-separated group of a UUID4.
short_id = str(uuid.uuid4()).split("-")[0]
# New id of the dataset Space; will reuse the user, otherwise update to your HF account.
to_id = f"distilabel-dataset-{short_id}"
new_space = duplicate_space(from_id, to_id=to_id)

In [83]:
# API key used to authenticate against the duplicated Space.
# NOTE(review): hardcoded credential, presumably the template Space's default key.
# Confirm, and rotate it before putting real data in the Space.
argilla_api_key = "admin.apikey"
# URL of the duplicated Space, built from its namespace and the new Space id.
argilla_space_url = f"https://{new_space.namespace}-{to_id}.hf.space"

In [102]:
# Display the URL of the Argilla Space.
argilla_space_url

'https://plaguss-distilabel-dataset-cddf4f3e.hf.space'

In [95]:
import argilla as rg

# Point the Argilla client at the Space, authenticating with the API key
# and selecting the "admin" workspace.
default_workspace = "admin"
rg.init(api_url=argilla_space_url, api_key=argilla_api_key, workspace=default_workspace)

In [98]:
import argilla as rg
#dataset = dstaset.select(range(10))

# Three markdown-rendered text fields, each titled after its own name.
fields = [
    rg.TextField(name=field_name, title=field_name, use_markdown=True)
    for field_name in ("seed", "problem", "solution")
]

# A single required free-text question shown to the annotator.
questions = [
    rg.TextQuestion(
        name="explorer",
        title="Review the fields of the dataset.",
        use_markdown=True,
        required=True,
    )
]

# Create dataset
rg_dataset = rg.FeedbackDataset(questions=questions, fields=fields)


In [89]:
# List the columns of the generated dataset.
oss_instruct_ds.column_names

['input',
 'generation_model',
 'generation_prompt',
 'raw_generation_responses',
 'problem',
 'solution',
 'generations']

In [90]:
# Display the first row, restricted to the seed input and the parsed problem/solution.
oss_instruct_ds.select_columns(["input", "problem", "solution"])[0]

{'input': 'def preprocess(data, lang):\n    data["arg_negate"] = parse_template_boolean_value(data, parameter="arg_negate", default_value=False)\n    data["arg_is_regex"] = parse_template_boolean_value(data, parameter="arg_is_regex", default_value=False)\n    return data\n',
 'problem': ['You are given a code snippet that defines a function `preprocess(data, lang)`. The function takes two parameters:\n- `data`: A dictionary representing some data.\n- `lang`: A string representing the language of the data.\n\nThe `preprocess` function performs some preprocessing on the input `data`. It adds two new key-value pairs to the dictionary `data`:\n- Key: `"arg_negate"`, Value: The result of calling a function `parse_template_boolean_value(data, parameter="arg_negate", default_value=False)`.\n- Key: `"arg_is_regex"`, Value: The result of calling a function `parse_template_boolean_value(data, parameter="arg_is_regex", default_value=False)`.\n\nYou need to implement the `preprocess` function to c

In [99]:
# Add rows: one Argilla record per generated row.
# Dataset being uploaded. The column check below uses this same object, so the
# loop and the check cannot disagree (previously the loop read `oss_instruct_og`
# while the check inspected `oss_instruct_ds`).
source_ds = oss_instruct_og
has_rationales = {"chosen_rationale", "rejected_rationale"} <= set(source_ds.column_names)

for row_idx, row in enumerate(source_ds):
    try:
        # `problem` and `solution` are lists (one entry per generation); take the first.
        # Named `record_fields` so the module-level `fields` list of field
        # definitions is not overwritten by a dict.
        record_fields = {
            "seed": row["input"],
            "problem": row["problem"][0],
            "solution": row["solution"][0],
        }
        suggestions = []
        if has_rationales:
            # NOTE(review): these question names are not among the questions defined
            # for `rg_dataset` in this notebook ("explorer" only) — confirm before
            # relying on this branch.
            suggestions = [
                {
                    "question_name": "chosen-rationale",
                    "value": row["chosen_rationale"]
                },
                {
                    "question_name": "rejected-rationale",
                    "value": row["rejected_rationale"]
                }
            ]
        rg_dataset.add_records(rg.FeedbackRecord(fields=record_fields, suggestions=suggestions))
    except Exception as e:
        # Best-effort upload: report which row failed and carry on with the rest.
        print(f"Skipping row {row_idx}: {e!r}")

In [101]:
# Upload the local feedback dataset to the Argilla server, into the "admin" workspace.
rg_dataset.push_to_argilla(
    workspace="admin",
    name="oss_instruct_gpt-3p5-1106",
)

Output()

RemoteFeedbackDataset(
   id=c157097d-7ed0-471b-a8fa-db42e339016a
   name=oss_instruct_gpt-3p5-1106
   workspace=Workspace(id=61543500-24a8-4a5e-b91d-65c7ce541c6e, name=admin, inserted_at=2024-02-16 11:48:08.311578, updated_at=2024-02-16 11:48:08.311578)
   url=https://plaguss-distilabel-dataset-cddf4f3e.hf.space/dataset/c157097d-7ed0-471b-a8fa-db42e339016a/annotation-mode
   fields=[RemoteTextField(id=UUID('0140ea76-c66d-4329-9f79-177634824881'), client=None, name='seed', title='seed', required=True, type='text', use_markdown=True), RemoteTextField(id=UUID('864237a9-1ef6-44a6-a103-32b0551c092b'), client=None, name='problem', title='problem', required=True, type='text', use_markdown=True), RemoteTextField(id=UUID('63c6847b-30ff-4c1c-a223-606c4459a137'), client=None, name='solution', title='solution', required=True, type='text', use_markdown=True)]
   questions=[RemoteTextQuestion(id=UUID('364f4976-d8e9-4c00-8294-95f551ef5657'), client=None, name='explorer', title='Review the fields of 