In [1]:
from datasets import load_dataset

In [2]:
ds = load_dataset("ise-uiuc/Magicoder-OSS-Instruct-75K", split="train")

In [3]:
import argilla as rg

In [4]:
df = ds.shuffle(seed=422).select(range(30)).select_columns(["lang", "seed", "problem"]).to_pandas()

In [5]:
df = ds.shuffle(seed=1).select(range(30)).select_columns(["lang", "seed", "problem", "solution"]).to_pandas()
idx = 0
print(df["seed"][idx].strip())
print("__________")
print(df["problem"][idx])
print("__________")
print(df["solution"][idx])


//-------------------------------------------------
// Implementation of class BRequest_RemoteArrayTypes23_sendArraysObject
// Generated from class byps.gen.cpp.GenApiClass
__________
You are tasked with implementing a class that manages remote array types in a distributed system. The class `BRequest_RemoteArrayTypes23_sendArraysObject` is generated from a template using a code generation tool. Your task is to create a method within this class that efficiently sends arrays of objects to a remote server. 

Your method should take an array of objects as input and send it to the remote server using the appropriate communication protocol. You should also handle any potential errors that may occur during the communication process.

Your implementation should be efficient and robust, ensuring that the arrays of objects are transmitted accurately and reliably to the remote server.

Write a method `sendArraysObject` within the `BRequest_RemoteArrayTypes23_sendArraysObject` class that fulfills 

In [6]:
from distilabel.pipeline import Pipeline
from distilabel.llm import OpenAILLM
from distilabel.tasks import TextGenerationTask
from dataclasses import dataclass
from distilabel.tasks import Prompt
from typing import List, Dict
import os

In [51]:
oss_instruct_prompt = """Please gain inspiration from the following random code snippet to create a high-quality programming problem. Present your output in two distinct sections:
[Problem Description] and [Solution].

Code snippet for inspiration:
```
{code}
```

Guidelines for each section:
1. [Problem Description]: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included.

2. [Solution]: Offer a comprehensive, **correct** solution that accurately addresses the [Problem Description] you provided."""

@dataclass
class OSSInstruct(TextGenerationTask):
    system_prompt: str = "You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions."

    def generate_prompt(self, input: str) -> Prompt:
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=oss_instruct_prompt.format(code=input)
          )

    def parse_output(self, output: str) -> List[Dict[str, str]]:
        problem, solution = output.split("[Solution]")
        return {
            "problem": problem.replace("[Problem Description]", "").strip(),
            "solution": solution.strip()
        }

## Alternative definition of OSS Instruct

Just generate the problem, this way it's more scalable.

*In case it's not enough, play with an extra guide like the following:*

`the provided code snippet it's only used as an inspiration to create a programming problem. If you reference it, you must add it to the problem description`

In [43]:
oss_instruct_prompt = """Please gain inspiration from the following random code snippet to create a high-quality programming problem.

Code snippet for inspiration:
```
{code}
```

Guidelines for the problem:
The problem should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are **explicitly included*. **Don't reference any provided code snippet** if you are not including it in the problem description."""


@dataclass
class OSSInstruct(TextGenerationTask):
    system_prompt: str = "You are exceptionally skilled at crafting high-quality programming problems."

    def generate_prompt(self, input: str) -> Prompt:
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=oss_instruct_prompt.format(code=input)
          )

    def parse_output(self, output: str) -> List[Dict[str, str]]:
        return {"problem": re.sub("Problem:?", "", output).strip()}


In [44]:
pipe_generation = Pipeline(
    generator=OpenAILLM(
        model="gpt-3.5-turbo",
        task=OSSInstruct(),
        api_key=os.getenv("OPENAI_API_KEY"),
        max_new_tokens=1024,
        num_threads=4,
        temperature=1
    )
)

In [46]:
subset = ds.shuffle(seed=422).select(range(30))

In [47]:
subset

Dataset({
    features: ['lang', 'raw_index', 'index', 'seed', 'openai_fingerprint', 'problem', 'solution'],
    num_rows: 30
})

In [48]:
distilabel_subset = subset.select_columns(["seed"]).rename_column("seed", "input")

In [49]:
distilabel_subset[0]

{'input': 'def preprocess(data, lang):\n    data["arg_negate"] = parse_template_boolean_value(data, parameter="arg_negate", default_value=False)\n    data["arg_is_regex"] = parse_template_boolean_value(data, parameter="arg_is_regex", default_value=False)\n    return data\n'}

In [50]:
oss_instruct_ds = pipe_generation.generate(
    dataset=distilabel_subset,
    num_generations=1,
    batch_size=8,
    checkpoint_strategy=None,
)

Flattening the indices: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 202.11 examples/s]


In [51]:
oss_instruct_ds.to_pandas().head()

Unnamed: 0,input,generation_model,generation_prompt,raw_generation_responses,problem,generations
0,"def preprocess(data, lang):\n data[""arg_neg...",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Problem:\n\nYou are given a function `preproc...,"[You are given a function `preprocess(data, la...",
1,bash start-datafari.sh\n,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Problem:\n\nYou have been asked to automate t...,[You have been asked to automate the deploymen...,
2,"sa.Column('is_deleted', sa.Boolean(), null...",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Problem Title: Event Synchronization\n\n## Pr...,[Title: Event Synchronization\n\n## Descripti...,
3,import com.google.common.collect.Lists;\nimpor...,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Problem Title: Tunnel Aspect Registry\n\nProb...,[Title: Tunnel Aspect Registry\n\n Description...,
4,//\n// This is an auto-generated file.\n//\n\n...,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Problem:\n\nYou are given a struct called `Co...,[You are given a struct called `Course` which ...,


In [41]:
#print(oss_instruct_ds[2]["problem"][0].replace("Problem", "").replace("Problem:", "").strip())
import re
re.sub("Problem:?", "", oss_instruct_ds[4]["problem"][0]).strip()


'You are given a struct called `Course` which represents a single course offered at a university. It has the following properties:\n\n- `id`: An integer representing the unique identifier for the course.\n- `name`: A string representing the name of the course.\n- `courseCode`: A string representing the code assigned to the course.\n- `accountId`: An integer representing the account to which the course belongs.\n\nYour task is to implement a function called `findCourseByCode` that takes in two parameters:\n\n1. `courses`: An array of `Course` objects representing a list of courses.\n2. `code`: A string representing the course code to be searched for.\n\nThe function should return an array of `Course` objects that match the given course code.\n\n**Function Signature:** `func findCourseByCode(courses: [Course], code: String) -> [Course]`\n\n**Input:**\nThe function `findCourseByCode` takes in two parameters:\n- `courses` (1 <= courses.count <= 10^5): An array of `Course` objects represent

In [81]:
#rg_dataset_0 = oss_instruct_ds.to_argilla()
#rg_dataset_og = oss_instruct_og.to_argilla()
#rg_dataset.push_to_argilla(name=name, workspace=workspace)

In [78]:
from huggingface_hub import duplicate_space

import uuid

In [82]:
from_id = "argilla/argilla-template-space"  # default template
to_id = f"distilabel-dataset-{str(uuid.uuid4()).split('-')[0]}"  # New id of the dataset, will reuse the user, otherwise update to your HF account
new_space = duplicate_space(from_id, to_id=to_id)

In [60]:
argilla_api_key = "admin.apikey"
argilla_space_url = f"https://{new_space.namespace}-{to_id}.hf.space"

In [61]:
argilla_space_url = 'https://plaguss-distilabel-dataset-cddf4f3e.hf.space'

In [62]:
import argilla as rg

default_workspace = "admin"

rg.init(
    api_key=argilla_api_key,
    api_url=argilla_space_url,
    workspace=default_workspace
)

This may lead to potential compatibility issues during your experience.
To ensure a seamless and optimized connection, we highly recommend aligning your client version with the server version.


In [57]:
import argilla as rg
#dataset = dstaset.select(range(10))

fields = [
    rg.TextField(name="seed", title="seed", use_markdown=True),
    rg.TextField(name="problem", title="problem", use_markdown=True),
#    rg.TextField(name="solution", title="solution", use_markdown=True)
]
questions = [
    rg.TextQuestion(
        name="explorer",
        title="Review the fields of the dataset.",
        required=True,
        use_markdown=True
    )
]

# Create dataset
rg_dataset = rg.FeedbackDataset(
    fields=fields,
    questions=questions,
)


In [89]:
oss_instruct_ds.column_names

['input',
 'generation_model',
 'generation_prompt',
 'raw_generation_responses',
 'problem',
 'solution',
 'generations']

In [90]:
oss_instruct_ds.select_columns(["input", "problem", "solution"])[0]

{'input': 'def preprocess(data, lang):\n    data["arg_negate"] = parse_template_boolean_value(data, parameter="arg_negate", default_value=False)\n    data["arg_is_regex"] = parse_template_boolean_value(data, parameter="arg_is_regex", default_value=False)\n    return data\n',
 'problem': ['You are given a code snippet that defines a function `preprocess(data, lang)`. The function takes two parameters:\n- `data`: A dictionary representing some data.\n- `lang`: A string representing the language of the data.\n\nThe `preprocess` function performs some preprocessing on the input `data`. It adds two new key-value pairs to the dictionary `data`:\n- Key: `"arg_negate"`, Value: The result of calling a function `parse_template_boolean_value(data, parameter="arg_negate", default_value=False)`.\n- Key: `"arg_is_regex"`, Value: The result of calling a function `parse_template_boolean_value(data, parameter="arg_is_regex", default_value=False)`.\n\nYou need to implement the `preprocess` function to c

In [55]:
oss_instruct_ds["generations"]

Dataset({
    features: ['input', 'generation_model', 'generation_prompt', 'raw_generation_responses', 'problem', 'generations'],
    num_rows: 30
})

In [58]:
# Add rows
for row in oss_instruct_ds:  # oss_instruct_og
    try:
        fields = {
            "seed": row["input"],
            "problem": row["problem"][0],
#            "solution": row["solution"][0],
        }
        suggestions = []
        if "chosen_rationale" in oss_instruct_ds.column_names and "rejected_rationale" in oss_instruct_ds.column_names:
            suggestions = [
                {
                    "question_name": "chosen-rationale",
                    "value": row["chosen_rationale"]
                },
                {
                    "question_name": "rejected-rationale",
                    "value": row["rejected_rationale"]
                }
            ]
        rg_dataset.add_records(rg.FeedbackRecord(fields=fields, suggestions=suggestions))
    except Exception as e:
        print(e)

In [63]:
rg_dataset.push_to_argilla(name="disticoder-problem-set", workspace="admin")

RemoteFeedbackDataset(
   id=f000c8a4-4b60-4bd7-bfc5-86cf15d4721c
   name=disticoder-problem-set
   workspace=Workspace(id=61543500-24a8-4a5e-b91d-65c7ce541c6e, name=admin, inserted_at=2024-02-16 11:48:08.311578, updated_at=2024-02-16 11:48:08.311578)
   url=https://plaguss-distilabel-dataset-cddf4f3e.hf.space/dataset/f000c8a4-4b60-4bd7-bfc5-86cf15d4721c/annotation-mode
   fields=[RemoteTextField(id=UUID('9e0bf926-3129-4179-8d44-09ef21d49e11'), client=None, name='seed', title='seed', required=True, type='text', use_markdown=True), RemoteTextField(id=UUID('26e91794-f1b6-4a3f-8760-430802a706aa'), client=None, name='problem', title='problem', required=True, type='text', use_markdown=True)]
   questions=[RemoteTextQuestion(id=UUID('223cff33-e130-4d4b-a6b6-2ebe720fba0d'), client=None, name='explorer', title='Review the fields of the dataset.', description=None, required=True, type='text', use_markdown=True)]
   guidelines=None
   metadata_properties=[]
   vectors_settings=[]
)

### Generate the solution

In [65]:
oss_instruct_ds

Dataset({
    features: ['input', 'generation_model', 'generation_prompt', 'raw_generation_responses', 'problem', 'generations'],
    num_rows: 30
})

In [95]:
problems_oss_instruct = oss_instruct_ds.rename_column("input", "seed")
problems_oss_instruct = problems_oss_instruct.rename_column("problem", "input")
problems_oss_instruct = problems_oss_instruct.map(lambda x: {"input": x["input"][0]})
problems_oss_instruct = problems_oss_instruct.remove_columns(['generation_model', 'generation_prompt', 'raw_generation_responses', 'generations'])

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 2251.45 examples/s]


In [86]:
problems_oss_instruct.to_pandas().head()

Unnamed: 0,seed,input
0,"def preprocess(data, lang):\n data[""arg_neg...","You are given a function `preprocess(data, lan..."
1,bash start-datafari.sh\n,You have been asked to automate the deployment...
2,"sa.Column('is_deleted', sa.Boolean(), null...",Title: Event Synchronization\n\n## Descriptio...
3,import com.google.common.collect.Lists;\nimpor...,Title: Tunnel Aspect Registry\n\n Description:...
4,//\n// This is an auto-generated file.\n//\n\n...,You are given a struct called `Course` which r...


In [93]:
oss_instruct_prompt = """Please gain inspiration from the following random code snippet to create a high-quality programming problem.

Code snippet for inspiration:
```
{code}
```

Guidelines for the problem:
The problem should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are **explicitly included*. **Don't reference any provided code snippet** if you are not including it in the problem description."""


@dataclass
class OSSolution(TextGenerationTask):
    system_prompt: str = (
        "You are exceptionally skilled at code generation and problem solving. ",
        "Offer a comprehensive, **correct** solution that accurately addresses the problem provided."
    )

#    def generate_prompt(self, input: str) -> Prompt:
#        return Prompt(
#            system_prompt=self.system_prompt,
#            formatted_prompt=oss_instruct_prompt.format(code=input)
#          )

    def parse_output(self, output: str) -> List[Dict[str, str]]:
        return {"solution": output}



pipe_generation_solutions = Pipeline(
    generator=OpenAILLM(
        model="gpt-3.5-turbo",
        task=OSSSolution(),
        api_key=os.getenv("OPENAI_API_KEY"),
        max_new_tokens=1024,
        num_threads=4,
        temperature=1
    )
)


In [80]:
problems_oss_instruct.select(range(4)).to_pandas()

Unnamed: 0,seed,input
0,"def preprocess(data, lang):\n data[""arg_neg...","[You are given a function `preprocess(data, la..."
1,bash start-datafari.sh\n,[You have been asked to automate the deploymen...
2,"sa.Column('is_deleted', sa.Boolean(), null...",[Title: Event Synchronization\n\n## Descripti...
3,import com.google.common.collect.Lists;\nimpor...,[Title: Tunnel Aspect Registry\n\n Description...


In [107]:
problems_oss_instruct.select(range(4)).to_pandas()["input"].iloc[4]

In [100]:
solutions_test = pipe_generation_solutions.generate(
    dataset=problems_oss_instruct.select(range(4)),
    num_generations=1,
    batch_size=8,
    checkpoint_strategy=None,
)

In [91]:
print(solutions_test["generations"][0][0])

Here is an implementation of the `parse_template_boolean_value` function:

```python
def parse_template_boolean_value(data, parameter, default_value):
    if parameter in data:
        return data[parameter]
    else:
        return default_value
```

The function first checks if the `parameter` is present in the `data` dictionary using the `in` operator. If the `parameter` is present, it returns its corresponding value from the `data` dictionary. Otherwise, it returns the `default_value` provided as an argument.
