In [2]:
from datasets import load_dataset

In [3]:
ds = load_dataset("ise-uiuc/Magicoder-OSS-Instruct-75K", split="train")

In [4]:
import argilla as rg

In [28]:
ds

Dataset({
    features: ['lang', 'raw_index', 'index', 'seed', 'openai_fingerprint', 'problem', 'solution'],
    num_rows: 75197
})

In [30]:
testing_size = 20
seed = 422

In [31]:
df = ds.shuffle(seed=seed).select(range(testing_size)).select_columns(["lang", "seed", "problem"]).to_pandas()

### Grab a sample of the original dataset with the same proportions

In [127]:
df = ds.shuffle(seed=seed).select_columns(["lang", "seed", "problem"]).to_pandas()

In [145]:
from datasets import Dataset

new_dataset_size = 5100
weights = (df["lang"].value_counts() / df["lang"].value_counts().sum())
weights = df["lang"].value_counts()
df_sampled = df.set_index("lang").sample(
    n=new_dataset_size,
    random_state=seed
).reset_index()
ds_sampled = Dataset.from_pandas(df_sampled, preserve_index=False)
df_sampled.reset_index()["lang"].value_counts() / new_dataset_size


python        0.511569
cpp           0.064314
rust          0.064314
swift         0.062353
shell         0.061569
java          0.060588
csharp        0.059804
typescript    0.058824
php           0.056667
Name: lang, dtype: float64

In [140]:
ds_sampled

Dataset({
    features: ['seed', 'problem'],
    num_rows: 5100
})

In [146]:
df = ds_sampled.shuffle(seed=seed).select(range(testing_size)).select_columns(["lang", "seed", "problem"]).to_pandas()
#df = ds.shuffle(seed=seed).select(range(testing_size)).select_columns(["lang", "seed", "problem", "solution"]).to_pandas()
idx = 0
print(df["seed"][idx].strip())
print("__________")
print(df["problem"][idx])
#print("__________")
#print(df["solution"][idx])


#print out all the node's first names
print('\n Printing first name from all nodes:')
print(g.V().first_name.toList()) 

#print out all the properties of person whose's first name is Shane
print('\n Printing all properties of person whose first name is Shane:')
print(g.V().has('person','first_name','Shane').valueMap().next()) 

#traversing the graph starting with Betty to then Shane to then Edith
print('\n Finding Betty and then looking up her parents:')
print(g.V().has('first_name', 'Betty').out('daughter_of').out('son_of').valueMap().toList())
__________
You are working on a project to model a family tree using a graph database. The graph database contains nodes representing individuals and edges representing relationships such as parent-child connections. Each node has properties like first name, last name, gender, and other relevant details. You are tasked with writing a program to perform various operations on this graph database using a graph traversal language like Gremlin.

You

In [1]:
from distilabel.pipeline import Pipeline
from distilabel.llm import OpenAILLM
from distilabel.tasks import TextGenerationTask
from dataclasses import dataclass
from distilabel.tasks import Prompt
from typing import List, Dict
import os

In [51]:
oss_instruct_prompt = """Please gain inspiration from the following random code snippet to create a high-quality programming problem. Present your output in two distinct sections:
[Problem Description] and [Solution].

Code snippet for inspiration:
```
{code}
```

Guidelines for each section:
1. [Problem Description]: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included.

2. [Solution]: Offer a comprehensive, **correct** solution that accurately addresses the [Problem Description] you provided."""

@dataclass
class OSSInstruct(TextGenerationTask):
    system_prompt: str = "You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions."

    def generate_prompt(self, input: str) -> Prompt:
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=oss_instruct_prompt.format(code=input)
          )

    def parse_output(self, output: str) -> List[Dict[str, str]]:
        problem, solution = output.split("[Solution]")
        return {
            "problem": problem.replace("[Problem Description]", "").strip(),
            "solution": solution.strip()
        }

## Alternative definition of OSS Instruct

Just generate the problem, this way it's more scalable.

*In case it's not enough, play with an extra guide like the following:*

`the provided code snippet it's only used as an inspiration to create a programming problem. If you reference it, you must add it to the problem description`

In [None]:
# First version
oss_instruct_prompt = """Please gain inspiration from the following random code snippet to create a high-quality programming problem.

Code snippet for inspiration:
```
{code}
```

Guidelines for the problem:
The problem should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are **explicitly included*. **Don't reference any provided code snippet** if you are not including it in the problem description."""

In [None]:
# Second version, trying to remove the Title from the generated problems
oss_instruct_prompt = """Please gain inspiration from the following random code snippet to create a high-quality programming problem.

Code snippet for inspiration:
```
{code}
```

Guidelines for the problem:
The problem should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. The problem must be written as a natural question, avoid titles or anything that would make it artificial. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are **explicitly included*. **Don't reference any provided code snippet** if you are not including it in the problem description."""

In [164]:
oss_instruct_prompt = """Please gain inspiration from the following random code snippet to create a high-quality programming problem.

Code snippet for inspiration:
```
{code}
```

Guidelines for the problem:
The problem should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. The problem must be written as a natural question, avoid titles or anything that would make it artificial. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are **explicitly included*. **Don't reference any provided code snippet** if you are not including it in the problem description."""

import re

@dataclass
class OSSInstructProblem(TextGenerationTask):
    system_prompt: str = "You are exceptionally skilled at crafting high-quality programming problems."

    def generate_prompt(self, input: str) -> Prompt:
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=oss_instruct_prompt.format(code=input)
          )

    def parse_output(self, output: str) -> List[Dict[str, str]]:
        return {"problem": re.sub("Problem:?", "", output).strip()}


In [165]:
pipe_generation = Pipeline(
    generator=OpenAILLM(
        model="gpt-3.5-turbo",
        task=OSSInstructProblem(),
        api_key=os.getenv("OPENAI_API_KEY"),
        max_new_tokens=1024,
        num_threads=4,
        temperature=1
    )
)

In [166]:
subset = ds.shuffle(seed=422).select(range(20))

In [167]:
subset

Dataset({
    features: ['lang', 'raw_index', 'index', 'seed', 'openai_fingerprint', 'problem', 'solution'],
    num_rows: 20
})

In [168]:
distilabel_subset = subset.select_columns(["seed"]).rename_column("seed", "input")

In [169]:
distilabel_subset[0]

{'input': 'def preprocess(data, lang):\n    data["arg_negate"] = parse_template_boolean_value(data, parameter="arg_negate", default_value=False)\n    data["arg_is_regex"] = parse_template_boolean_value(data, parameter="arg_is_regex", default_value=False)\n    return data\n'}

In [None]:
oss_problems

In [278]:
def load_dataset_snippets(seed: int = 422, nrows: int = 5100) -> Dataset:
    ds = load_dataset("ise-uiuc/Magicoder-OSS-Instruct-75K", split="train")
    df_sampled = df.set_index("lang").sample(
        n=nrows,
        random_state=seed
    ).reset_index()
    ds_sampled = Dataset.from_pandas(df_sampled, preserve_index=False)
    return ds_sampled.select_columns(["seed"]).rename_column("seed", "input")


In [204]:
distilabel_subset[0]

{'input': 'def preprocess(data, lang):\n    data["arg_negate"] = parse_template_boolean_value(data, parameter="arg_negate", default_value=False)\n    data["arg_is_regex"] = parse_template_boolean_value(data, parameter="arg_is_regex", default_value=False)\n    return data\n'}

In [170]:
oss_instruct_ds = pipe_generation.generate(
    dataset=distilabel_subset,
    num_generations=1,
    batch_size=8,
    checkpoint_strategy=None,
)

Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

In [171]:
oss_instruct_ds.to_pandas().head()

Unnamed: 0,input,generation_model,generation_prompt,raw_generation_responses,problem,generations
0,"def preprocess(data, lang):\n data[""arg_neg...",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,"[Question:\n\nIn a web application project, yo...","[Question:\n\nIn a web application project, yo...",
1,bash start-datafari.sh\n,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[You are tasked with managing a large set of f...,[You are tasked with managing a large set of f...,
2,"sa.Column('is_deleted', sa.Boolean(), null...",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,"[In a database system, there are two tables: `...","[In a database system, there are two tables: `...",
3,import com.google.common.collect.Lists;\nimpor...,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[You have been tasked with optimizing the perf...,[You have been tasked with optimizing the perf...,
4,//\n// This is an auto-generated file.\n//\n\n...,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Problem:\n\nA university is planning to creat...,[A university is planning to create a course r...,


In [36]:
import random

idx = random.randint(0, len(subset))
print("SEED:")
print(oss_instruct_ds[idx]["input"])
print("______" * 6)
print(oss_instruct_ds[idx]["problem"][0])

SEED:
})
export class WarrantyRepairPointMoreDetailComponent {
    @Input() model: PointModel = new PointModel();
}

____________________________________
Title: Adding Method to Point Model Class

You are working on a project that involves managing warranty repair points for various products. In the project, there is a `PointModel` class that represents a warranty repair point. The class has attributes such as ID, name, and location.

You need to add a new method to the `PointModel` class to calculate the distance between two warranty repair points. The distance between two points can be calculated using their geographic coordinates (latitude and longitude). Assume the Earth is a perfect sphere with a radius of 6371 km.

Write a Python method named `calculate_distance()` inside the `PointModel` class that takes another `PointModel` object as a parameter and returns the distance in kilometers between the current point and the specified point using the Haversine formula.

The Haversine f

In [148]:
from huggingface_hub import duplicate_space

import uuid

In [153]:
from_id = "argilla/argilla-template-space"  # default template
to_id = f"distilabel-dataset-{str(uuid.uuid4()).split('-')[0]}"  # New id of the dataset, will reuse the user, otherwise update to your HF account
#new_space = duplicate_space(from_id, to_id=to_id)

In [154]:
argilla_api_key = "admin.apikey"
argilla_space_url = f"https://{new_space.namespace}-{to_id}.hf.space"
argilla_space_url = 'https://plaguss-distilabel-dataset-cddf4f3e.hf.space'

In [155]:
import argilla as rg

default_workspace = "admin"

rg.init(
    api_key=argilla_api_key,
    api_url=argilla_space_url,
    workspace=default_workspace
)

This may lead to potential compatibility issues during your experience.
To ensure a seamless and optimized connection, we highly recommend aligning your client version with the server version.


In [172]:
import argilla as rg
#dataset = dstaset.select(range(10))

fields = [
    rg.TextField(name="seed", title="seed", use_markdown=True),
    rg.TextField(name="problem", title="problem", use_markdown=True),
#    rg.TextField(name="solution", title="solution", use_markdown=True)
]
questions = [
    rg.TextQuestion(
        name="explorer",
        title="Review the fields of the dataset.",
        required=True,
        use_markdown=True
    )
]

# Create dataset
rg_dataset = rg.FeedbackDataset(
    fields=fields,
    questions=questions,
)


In [179]:
print(oss_instruct_ds[6]["problem"][0])

****

You are tasked with creating a program to manage a library's collection of books. Your program should be able to perform various operations related to adding, searching, and managing the books in the library.

Implement a Python class called `Library` with the following requirements:

1. The constructor should initialize an empty list to store the books in the library.

2. Include a method `add_book` that takes in the title and author of a book as parameters and adds a new book entry to the library. Each book entry should be stored as a dictionary with the following keys: "title", "author".

3. Implement a method `search_books` that takes in a search query as a parameter and returns a list of books whose title or author matches the search query. The search should be case-insensitive.

4. Create a method `remove_book` that takes a book title as a parameter and removes the book with that title from the library, if it exists. If the book is successfully removed, return `True`; other

In [173]:
oss_instruct_ds.column_names

['input',
 'generation_model',
 'generation_prompt',
 'raw_generation_responses',
 'problem',
 'generations']

In [174]:
# Add rows
for row in oss_instruct_ds:  # oss_instruct_og
    try:
        fields = {
            "seed": row["input"],
            "problem": row["problem"][0],
#            "solution": row["solution"][0],
        }
        suggestions = []
        if "chosen_rationale" in oss_instruct_ds.column_names and "rejected_rationale" in oss_instruct_ds.column_names:
            suggestions = [
                {
                    "question_name": "chosen-rationale",
                    "value": row["chosen_rationale"]
                },
                {
                    "question_name": "rejected-rationale",
                    "value": row["rejected_rationale"]
                }
            ]
        rg_dataset.add_records(rg.FeedbackRecord(fields=fields, suggestions=suggestions))
    except Exception as e:
        print(e)



In [175]:
rg_dataset_name = "disticoder-problem-set-v2"
rg_dataset.push_to_argilla(name=rg_dataset_name, workspace="admin")

Output()

RemoteFeedbackDataset(
   id=8c6bc606-0d87-4f1d-a6a0-d6b66e19cb1b
   name=disticoder-problem-set-v2
   workspace=Workspace(id=61543500-24a8-4a5e-b91d-65c7ce541c6e, name=admin, inserted_at=2024-02-16 11:48:08.311578, updated_at=2024-02-16 11:48:08.311578)
   url=https://plaguss-distilabel-dataset-cddf4f3e.hf.space/dataset/8c6bc606-0d87-4f1d-a6a0-d6b66e19cb1b/annotation-mode
   fields=[RemoteTextField(id=UUID('8fa37b5d-16a7-4243-a446-d72b7632f53c'), client=None, name='seed', title='seed', required=True, type='text', use_markdown=True), RemoteTextField(id=UUID('1c67bbf3-363d-4e25-b5ff-723d21d8fb2f'), client=None, name='problem', title='problem', required=True, type='text', use_markdown=True)]
   questions=[RemoteTextQuestion(id=UUID('04c672b7-0e01-4423-acf7-6c5ed55c7691'), client=None, name='explorer', title='Review the fields of the dataset.', description=None, required=True, type='text', use_markdown=True)]
   guidelines=None
   metadata_properties=[]
   vectors_settings=[]
)

### Generate the solution

In [193]:
oss_instruct_ds

Dataset({
    features: ['input', 'generation_model', 'generation_prompt', 'raw_generation_responses', 'problem', 'generations'],
    num_rows: 20
})

In [270]:
oss_solution_prompt = """Offer a comprehensive, **correct** solution that accurately addresses the following problem:
{problem}"""


@dataclass
class OSSSolution(TextGenerationTask):
    system_prompt: str = "You are exceptionally skilled at code generation and problem solving."

    def generate_prompt(self, input: str) -> Prompt:
        return Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=oss_solution_prompt.format(problem=input)
          )

    def parse_output(self, output: str) -> List[Dict[str, str]]:
        return {"solution": output}



pipe_generation_solutions = Pipeline(
    generator=OpenAILLM(
        model="gpt-3.5-turbo",
        task=OSSSolution(),
        api_key=os.getenv("OPENAI_API_KEY"),
        max_new_tokens=1024,
        num_threads=4,
        temperature=1
    )
)


In [281]:
from datasets import load_dataset
DATASET_NAME_PROBLEMS = "argilla/oss-instruct-problems-step-1"
oss_instruct_ds = load_dataset(DATASET_NAME_PROBLEMS, split="train")


In [288]:
oss_instruct_ds_new = (
        oss_instruct_ds
        .rename_column("input", "code_snippet")
        .rename_column("problem", "input")
        .map(lambda ex: {"input": ex["input"][0]}, remove_columns=["code_snippet"])
        .remove_columns("generations")
    )

Map:   0%|          | 0/5100 [00:00<?, ? examples/s]

In [289]:
oss_instruct_ds_new[0]["input"]

'**Problem:**\n\nYou are tasked with creating a program that simulates a bookstore\'s inventory management system. The system should allow the user to add new books to the inventory, search for books by title, author, or genre, update the quantity of available books, and display all the books currently in stock.\n\nYour task is to implement a class called `Book` with the following specifications:\n\n- Properties:\n  - `title`: a string representing the title of the book.\n  - `author`: a string representing the author of the book.\n  - `genre`: a string representing the genre of the book.\n  - `quantity`: an integer representing the quantity of this book in stock.\n\n- Methods:\n  - `__init__`: a constructor method that initializes the book\'s title, author, genre, and quantity.\n  - `__str__`: a method that returns a string representation of the book in the format: "Title: [title], Author: [author], Genre: [genre], Quantity: [quantity]".\n\nAdditionally, you need to implement a class 

In [299]:
cosa = load_dataset("argilla/oss-instruct-solutions-step-2", split="train")

Downloading readme:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5100 [00:00<?, ? examples/s]

In [311]:
cosa = load_dataset("argilla/oss-instruct-solutions-step-2-magicoder", split="train")

Downloading readme:   0%|          | 0.00/638 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5100 [00:00<?, ? examples/s]

In [307]:
cosa.to_pandas().head()#["input"].isna().sum()

Unnamed: 0,code_snippet,generation_model,generation_prompt,raw_generation_responses,input,solution,generations
0,"\t\tsuper(debugName, CptLanguage.INSTANCE);\n\...",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[**Problem:**\n\nYou are tasked with creating ...,**Problem:**\n\nYou are tasked with creating a...,[Here is a Python solution:\n\n```python\nclas...,
1,\t\t\t\t\t// расширенный поиск\n\t\t\t\t\tnew ...,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[**Problem:**\n\nYou are working on developing...,**Problem:**\n\nYou are working on developing ...,[In order to implement such an advanced search...,
2,class TagView(ListView):\n model = Tag\n ...,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[You are tasked with developing a blogging pla...,You are tasked with developing a blogging plat...,[The solution provides the correct implementat...,
3,export {};\n,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Imagine you are developing a new photo editin...,Imagine you are developing a new photo editing...,[Here is a correct solution in TypeScript:\n\n...,
4,\tdef initialize(self):\n\n\t\t# check communi...,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Your task is to implement a Python class that...,Your task is to implement a Python class that ...,[You would accomplish this with the help of Py...,


In [312]:
cosa_df = cosa.to_pandas()
new_ds = Dataset.from_pandas(cosa_df[cosa_df["solution"].isna()].drop(columns=["generation_model", "generation_prompt", "raw_generation_responses", "solution", "generations"]), preserve_index=False)


In [320]:
new_ds.to_pandas()["input"].apply(len).describe()

count    4076.000000
mean     1568.375859
std       404.622201
min       368.000000
25%      1279.750000
50%      1536.500000
75%      1814.000000
max      4312.000000
Name: input, dtype: float64

In [297]:
cosa_df = cosa.to_pandas()
new_ds = Dataset.from_pandas(cosa_df[cosa_df["solution"].isna()].drop(columns=["generation_model", "generation_prompt", "raw_generation_responses", "solution", "generations"]), preserve_index=False)


Dataset({
    features: ['code_snippet', 'input'],
    num_rows: 3100
})

In [553]:
import json
d = json.loads("""{
  "dag": {
    "directed": true,
    "multigraph": false,
    "graph": [],
    "nodes": [
      {
        "step": {
          "name": "load_dataset",
          "batch_size": 50,
          "_type_info_": {
            "module": "distilabel.pipeline.step.generators.huggingface",
            "name": "LoadHubDataset"
          }
        },
        "id": "load_dataset"
      },
      {
        "step": {
          "name": "rename_columns",
          "_type_info_": {
            "module": "test_pipe_simple",
            "name": "RenameColumns"
          }
        },
        "id": "rename_columns"
      },
      {
        "step": {
          "name": "generate_response",
          "_type_info_": {
            "module": "test_pipe_simple",
            "name": "GenerateResponse"
          }
        },
        "id": "generate_response"
      }
    ],
    "adjacency": [
      [
        {
          "id": "rename_columns"
        }
      ],
      [
        {
          "id": "generate_response"
        }
      ],
      []
    ],
    "_type_info_": {
      "module": "distilabel.pipeline._dag",
      "name": "DAG"
    }
  },
  "_type_info_": {
    "module": "distilabel.pipeline.local",
    "name": "Pipeline"
  }
}""")
d
#hash(d["dag"])
[s for s in d["dag"]["nodes"]]
[s for s in d["dag"]["adjacency"]]
# INFO FROM NODES AND ADJACENCY
import hashlib

print('Hash for 181 is:', hash(181))

# hash for decimal
print('Hash for 181.23 is:',hash(181.23))

# hash for string
print('Hash for Python is:', hash('Python'))

hasher  = hashlib.sha1()
step_ids = [node["id"] for node in d["dag"]["nodes"]]
print(step_names)
adjacency_ids = [node["id"] for node in d["dag"]["adjacency"]]
print(step_names)
hasher.update("-".join(steps).encode())
hasher.hexdigest()

Hash for 181 is: 181
Hash for 181.23 is: 530343892119126197
Hash for Python is: -3205686617128269429
['load_dataset', 'rename_columns', 'generate_response']


'da39a3ee5e6b4b0d3255bfef95601890afd80709'

### Grab all the intermediate datasets

In [338]:
from datasets import load_dataset

ds_names = ["argilla/oss-instruct-solutions-step-1", "argilla/oss-instruct-solutions-step-1.5"]
dss = [load_dataset(ds_name, split="train") for ds_name in ds_names]

Downloading readme:   0%|          | 0.00/638 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3100 [00:00<?, ? examples/s]

In [354]:
import pandas as pd
df1 = dss[0].to_pandas()
df2 = dss[1].to_pandas()

df_openai = pd.concat([df1[df1["solution"].notna()], df2[df2["solution"].notna()]]).reset_index().drop(columns="index")
df_openai

Unnamed: 0,code_snippet,generation_model,generation_prompt,raw_generation_responses,input,solution,generations
0,"\t\tsuper(debugName, CptLanguage.INSTANCE);\n\...",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[**Problem:**\n\nYou are tasked with creating ...,**Problem:**\n\nYou are tasked with creating a...,[```python\nclass Book:\n def __init__(self...,
1,\t\t\t\t\t// расширенный поиск\n\t\t\t\t\tnew ...,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[**Problem:**\n\nYou are working on developing...,**Problem:**\n\nYou are working on developing ...,"[To address the problem outlined, we will crea...",
2,class TagView(ListView):\n model = Tag\n ...,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[You are tasked with developing a blogging pla...,You are tasked with developing a blogging plat...,[```python\ndef get_queryset(self):\n # Ret...,
3,export {};\n,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Imagine you are developing a new photo editin...,Imagine you are developing a new photo editing...,[Here is the implementation of the `enhanceCol...,
4,\tdef initialize(self):\n\n\t\t# check communi...,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Your task is to implement a Python class that...,Your task is to implement a Python class that ...,[Here is the implementation of the `MusicPlaye...,
...,...,...,...,...,...,...,...
5095,"For a vertex ``v`` on the equator, if t...",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Below is the implementation of the `opposite_...,You are tasked with developing a geographic vi...,[Below is the implementation of the `opposite_...,
5096,"\n <a href=""<?php echo base...",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Here is a comprehensive and correct solution ...,An online platform consists of user-generated ...,[Here is a comprehensive and correct solution ...,
5097,"version=version,\n",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Here is the solution that addresses the probl...,You are tasked with creating a software progra...,[Here is the solution that addresses the probl...,
5098,"\t\telse if(name == ""UA:"")\n\t\t\tte->uvwA.x()...",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Here is the implementation of the `updateUVWC...,**Problem:**\n\nYou are developing a software ...,[Here is the implementation of the `updateUVWC...,


In [355]:
ds_names = [
    "argilla/oss-instruct-solutions-step-2-magicoder",
    "argilla/oss-instruct-solutions-step-2-notus",
    "argilla/oss-instruct-solutions-step-2-wizardcoder"
]
from datasets import load_dataset

dss = [load_dataset(ds_name, split="train") for ds_name in ds_names]

Downloading readme:   0%|          | 0.00/638 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5100 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/638 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5100 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/638 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5100 [00:00<?, ? examples/s]

In [371]:
df_openai

Unnamed: 0,code_snippet,generation_model,generation_prompt,raw_generation_responses,input,solution,generations
0,"\t\tsuper(debugName, CptLanguage.INSTANCE);\n\...",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[**Problem:**\n\nYou are tasked with creating ...,**Problem:**\n\nYou are tasked with creating a...,[```python\nclass Book:\n def __init__(self...,
1,\t\t\t\t\t// расширенный поиск\n\t\t\t\t\tnew ...,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[**Problem:**\n\nYou are working on developing...,**Problem:**\n\nYou are working on developing ...,"[To address the problem outlined, we will crea...",
2,class TagView(ListView):\n model = Tag\n ...,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[You are tasked with developing a blogging pla...,You are tasked with developing a blogging plat...,[```python\ndef get_queryset(self):\n # Ret...,
3,export {};\n,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Imagine you are developing a new photo editin...,Imagine you are developing a new photo editing...,[Here is the implementation of the `enhanceCol...,
4,\tdef initialize(self):\n\n\t\t# check communi...,[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Your task is to implement a Python class that...,Your task is to implement a Python class that ...,[Here is the implementation of the `MusicPlaye...,
...,...,...,...,...,...,...,...
5095,"For a vertex ``v`` on the equator, if t...",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Below is the implementation of the `opposite_...,You are tasked with developing a geographic vi...,[Below is the implementation of the `opposite_...,
5096,"\n <a href=""<?php echo base...",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Here is a comprehensive and correct solution ...,An online platform consists of user-generated ...,[Here is a comprehensive and correct solution ...,
5097,"version=version,\n",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Here is the solution that addresses the probl...,You are tasked with creating a software progra...,[Here is the solution that addresses the probl...,
5098,"\t\telse if(name == ""UA:"")\n\t\t\tte->uvwA.x()...",[gpt-3.5-turbo],[[{'content': 'You are exceptionally skilled a...,[Here is the implementation of the `updateUVWC...,**Problem:**\n\nYou are developing a software ...,[Here is the implementation of the `updateUVWC...,


In [514]:
#dfs = []
#for ds in dss:
#    df = ds.to_pandas()[["code_snippet", "input", "solution"]]
#    dfs.append(df)
ds_by_name = {
    "ise-uiuc/Magicoder-S-DS-6.7B": dss[0],
    "argilla/notus-7b-v1": dss[1],
    "WizardLM/WizardCoder-15B-V1.0": dss[2]
}
ds_openai = Dataset.from_pandas(df_openai[["code_snippet", "generation_model", "input", "solution"]], preserve_index=False)
from tqdm import tqdm
from collections import defaultdict
ds_dict = defaultdict(list)

for i in tqdm(range(len(ds_openai)), total=len(ds_openai)):
    generation_models = ["gpt-3.5-turbo"]
    generations = ds_openai[i]["solution"]
    for generation_model, ds in ds_by_name.items():
        generation_models.append(generation_model)
        generations.append(ds["solution"][i][0])
#    ds_dict[i]["generation_model"] = generation_models
    ds_dict["code_snippet"].append(ds_openai[i]["code_snippet"])
    ds_dict["generation_model"].append(generation_models)
    ds_dict["problem"].append(ds_openai[i]["input"])
    ds_dict["solutions"].append(generations)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5100/5100 [01:51<00:00, 45.94it/s]


In [520]:
new_ds = Dataset.from_dict(ds_dict)
new_ds.push_to_hub("argilla/disticoder-dpo-v2-unlabelled", split="train", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/argilla/disticoder-dpo-v2-unlabelled/commit/45bf0f5549b7e6557e9b2aab9d92da7b34c7c6c2', commit_message='Upload dataset', commit_description='', oid='45bf0f5549b7e6557e9b2aab9d92da7b34c7c6c2', pr_url=None, pr_revision=None, pr_num=None)

In [523]:
    from textwrap import dedent
    text_description = dedent("""# Code Quality Assessment
    Evaluate the model's generated code based on various criteria:
    1. **Correctness**: Does the code produce the expected output and perform the intended task without errors?
    2. **Maintainability**: Is the code well-structured, easy to understand, and easy to modify?
    3. **Performance**: How well-optimized is the code in terms of runtime performance and resource usage?
    4. **Consistency & Coding Standards**: Does the code follow established coding conventions and maintain a consistent coding style?
    Your role is to provide a holistic assessment considering all the above factors.
    
    **Scoring**: Rate outputs 1 to 5 based on the overall quality, considering all aspects:
    """)
print(text_description)

# Code Quality Assessment
Evaluate the model's generated code based on various criteria:
1. **Correctness**: Does the code produce the expected output and perform the intended task without errors?
2. **Maintainability**: Is the code well-structured, easy to understand, and easy to modify?
3. **Performance**: How well-optimized is the code in terms of runtime performance and resource usage?
4. **Consistency & Coding Standards**: Does the code follow established coding conventions and maintain a consistent coding style?
Your role is to provide a holistic assessment considering all the above factors.

**Scoring**: Rate outputs 1 to 5 based on the overall quality, considering all aspects:



In [None]:
import random
idx = random.randint(0, len(new_ds))
print(idx)
print(new_ds[idx]["problem"])
for i, g in enumerate(new_ds[idx]["solutions"]):
    print("************")
    print("MODEL: ", new_ds[idx]["generation_model"][i])
    print("************")
    print(g)
    print("************")

In [359]:
# ORIGIN DATASET
DATASET_NAME_PROBLEMS = "argilla/oss-instruct-problems-step-1"

oss_instruct_ds = load_dataset(DATASET_NAME_PROBLEMS, split="train")

ds_second = (
    oss_instruct_ds
    .rename_column("input", "code_snippet")
    .rename_column("problem", "input")
    .map(lambda ex: {"input": ex["input"][0]})
    .remove_columns("generations")
)

---

In [271]:
oss_problems = (
    oss_instruct_ds
    .rename_column("input", "seed")
    .rename_column("problem", "input")
    .remove_columns(["generation_model", "generation_prompt", "raw_generation_responses", "generations"])
)

In [273]:
ds_oss_solutions = pipe_generation_solutions.generate(
    dataset=oss_problems,
    num_generations=1,
    batch_size=8,
    checkpoint_strategy=None,
)

Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

---

In [556]:
from datasets import load_dataset
from distilabel.tasks.preference.ultrafeedback import UltraFeedbackTask, Rating

ds = load_dataset("argilla/disticoder-dpo-v2-unlabelled", split="train")
# Prepare it for the labelling step
ds = (
    ds.rename_column("problem", "input").rename_column("solutions", "generations")
)

ratings = [
    Rating(
        value=1,
        description="**Low Quality**: Code is incorrect, inconsistent, and inefficient.",
    ),
    Rating(
        value=2,
        description="**Moderate Quality**: Code has major errors and inconsistencies, affecting overall functionality.",
    ),
    Rating(
        value=3,
        description="**Good**: Code is partially correct with noticeable issues.",
    ),
    Rating(
        value=4,
        description="**Very Good**: Code is mostly correct and consistent.",
    ),
    Rating(
        value=5,
        description="**Excellent**: Code is entirely correct, consistent, and efficient.",
    ),
]

from textwrap import dedent
text_description = dedent("""# Code Quality Assessment
Evaluate the model's generated code based on various criteria:
1. **Correctness**: Does the code produce the expected output and perform the intended task without errors?
2. **Maintainability**: Is the code well-structured, easy to understand, and easy to modify?
3. **Performance**: How well-optimized is the code in terms of runtime performance and resource usage?
4. **Consistency & Coding Standards**: Does the code follow established coding conventions and maintain a consistent coding style?
Your role is to provide a holistic assessment considering all the above factors.

**Scoring**: Rate outputs 1 to 5 based on the overall quality, considering all aspects:
""")

uf_code_quality = UltraFeedbackTask(
    system_prompt="Your role is to evaluate code quality based on given criteria.",
    task_description=text_description,
    ratings=ratings,
)

labelling_task = "code_quality"
tasks = {
    "overall_quality": UltraFeedbackTask.for_overall_quality(),
    "instruction_following": UltraFeedbackTask.for_instruction_following(),
    "honesty": UltraFeedbackTask.for_honesty(),
    "truthfulness": UltraFeedbackTask.for_truthfulness(),
    "code_quality": uf_code_quality
}
if labelling_task in tasks.keys():
    tasks = {labelling_task: tasks[labelling_task]}
else:
    raise ValueError(f"The task must be one of: {tasks.keys()}")

print("Selected the following tasks:", tasks.keys())
# Create the labeller pipelines with the different tasks

labeller_pipelines = {}


Selected the following tasks: dict_keys(['code_quality'])


In [560]:
tasks["code_quality"]

UltraFeedbackTask(ratings=[{'value': 1, 'description': '**Low Quality**: Code is incorrect, inconsistent, and inefficient.'}, {'value': 2, 'description': '**Moderate Quality**: Code has major errors and inconsistencies, affecting overall functionality.'}, {'value': 3, 'description': '**Good**: Code is partially correct with noticeable issues.'}, {'value': 4, 'description': '**Very Good**: Code is mostly correct and consistent.'}, {'value': 5, 'description': '**Excellent**: Code is entirely correct, consistent, and efficient.'}], task_description="# Code Quality Assessment\nEvaluate the model's generated code based on various criteria:\n1. **Correctness**: Does the code produce the expected output and perform the intended task without errors?\n2. **Maintainability**: Is the code well-structured, easy to understand, and easy to modify?\n3. **Performance**: How well-optimized is the code in terms of runtime performance and resource usage?\n4. **Consistency & Coding Standards**: Does the c

In [570]:
labeller_pipeline = Pipeline(
    labeller=OpenAILLM(
        model="gpt-4-1106-preview",  # gpt-4 turbo
        task=UltraFeedbackTask.for_instruction_following(),
#        task=tasks["code_quality"],
        max_new_tokens=512,
        num_threads=8,
        api_key=os.getenv("OPENAI_API_KEY"),
        temperature=0.3
    )
)

In [566]:
os.environ["HF_API_TOKEN"] =  "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"
os.getenv("HF_API_TOKEN")


'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'

In [571]:
from distilabel.dataset import DatasetCheckpoint

checkpoint = DatasetCheckpoint(
    strategy="hf-hub",
    extra_kwargs={
        "repo_id": f"argilla/disticoder-dpo-v2-{'test-2'}",
        "token": os.getenv("HF_API_TOKEN"),
        "private": True,
        "split": "train"
    },
    save_frequency=10
)

new_ds = labeller_pipeline.generate(
    ds.select(range(5)),
    num_generations=1,
    batch_size=16,
    checkpoint_strategy=checkpoint,
)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/agus/.cache/huggingface/token
Login successful


Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/689 [00:00<?, ?B/s]

---

In [239]:
idx = random.randint(0, len(ds_oss_solutions))
print(ds_oss_solutions[0]["solution"][0])

Here is the Python code for the `preprocess` function that handles the preprocessing of the given parameters `arg_negate` and `arg_is_regex`:

```python
def preprocess(data, lang):
    # Parse arg_negate to boolean value, default False
    data['arg_negate'] = data.get('arg_negate', 'false').lower() == 'true'

    # Parse arg_is_regex to boolean value, default False
    data['arg_is_regex'] = data.get('arg_is_regex', 'false').lower() == 'true'

    return data

# Input dictionary
data = {
    "arg_negate": "true",
    "arg_is_regex": "false"
}

lang = "Python"

# Preprocess the input data dictionary
updated_data = preprocess(data, lang)

# Display the resulting dictionary after preprocessing
print(updated_data)
```

When running the given input dictionary through the `preprocess` function, the output dictionary with the updated values for `arg_negate` and `arg_is_regex` will be:

```python
{'arg_negate': True, 'arg_is_regex': False}
```

This output confirms that the function correctly

In [265]:
print(ds_oss_solutions[4]["solution"][0])

```swift
struct Course {
    let id: Int
    let name: String
    let courseCode: String
    var accountId: Int
}

struct CourseRegistrationSystem {
    var courses: [Course] = []

    func registerForCourse(courseId: Int, accountId: Int) -> String {
        if let courseIndex = courses.firstIndex(where: { $0.id == courseId }) {
            if courses[courseIndex].accountId == accountId {
                return "Registration successful for course: \(courses[courseIndex].name)"
            } else {
                return "Error: Invalid account ID for course registration"
            }
        } else {
            return "Error: Course with ID \(courseId) not found"
        }
    }

    func getCourseDetails(courseId: Int) -> Course? {
        return courses.first(where: { $0.id == courseId })
    }

    mutating func updateCourseAccount(courseId: Int, newAccountId: Int) -> String {
        if let courseIndex = courses.firstIndex(where: { $0.id == courseId }) {
            courses[cours

In [274]:
import argilla as rg
#dataset = dstaset.select(range(10))

fields = [
#    rg.TextField(name="seed", title="seed", use_markdown=True),
    rg.TextField(name="problem", title="problem", use_markdown=True),
    rg.TextField(name="solution", title="solution", use_markdown=True)
]
questions = [
    rg.TextQuestion(
        name="explorer",
        title="Review the fields of the dataset.",
        required=True,
        use_markdown=True
    )
]

# Create dataset
rg_dataset = rg.FeedbackDataset(
    fields=fields,
    questions=questions,
)

In [275]:
# Add rows
for row in ds_oss_solutions:  # oss_instruct_og
    try:
        fields = {
#            "seed": row["input"],
            "problem": row["input"],
            "solution": row["solution"][0],
        }
        suggestions = []
        if "chosen_rationale" in oss_instruct_ds.column_names and "rejected_rationale" in oss_instruct_ds.column_names:
            suggestions = [
                {
                    "question_name": "chosen-rationale",
                    "value": row["chosen_rationale"]
                },
                {
                    "question_name": "rejected-rationale",
                    "value": row["rejected_rationale"]
                }
            ]
        rg_dataset.add_records(rg.FeedbackRecord(fields=fields, suggestions=suggestions))
    except Exception as e:
        print(e)



In [276]:
rg_dataset_name = "disticoder-problem-solution-set-v5"
rg_dataset.push_to_argilla(name=rg_dataset_name, workspace="admin")

Output()

RemoteFeedbackDataset(
   id=7ea76f7f-c424-4cb2-8b71-993799e88586
   name=disticoder-problem-solution-set-v5
   workspace=Workspace(id=61543500-24a8-4a5e-b91d-65c7ce541c6e, name=admin, inserted_at=2024-02-16 11:48:08.311578, updated_at=2024-02-16 11:48:08.311578)
   url=https://plaguss-distilabel-dataset-cddf4f3e.hf.space/dataset/7ea76f7f-c424-4cb2-8b71-993799e88586/annotation-mode
   fields=[RemoteTextField(id=UUID('f2dc1538-d48b-463f-b722-8d9de19261db'), client=None, name='problem', title='problem', required=True, type='text', use_markdown=True), RemoteTextField(id=UUID('96263b4e-abf9-4efd-811b-6a1de20262c8'), client=None, name='solution', title='solution', required=True, type='text', use_markdown=True)]
   questions=[RemoteTextQuestion(id=UUID('e83fed2a-9ffc-4772-8140-0a1150397cc6'), client=None, name='explorer', title='Review the fields of the dataset.', description=None, required=True, type='text', use_markdown=True)]
   guidelines=None
   metadata_properties=[]
   vectors_setting