In [2]:
!pip install "distilabel[hf-transformers, openai]>=1.0.0"

Collecting distilabel>=1.0.0 (from distilabel[hf-transformers,openai]>=1.0.0)
  Downloading distilabel-1.3.2-py3-none-any.whl.metadata (13 kB)
Collecting datasets>=2.16.0 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting httpx>=0.25.2 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting multiprocess>=0.70 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting orjson>=3.10.0 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker>=2.8.2 (from distilabel>=

In [3]:
from datasets import DatasetDict, Dataset
import pandas as pd
from distilabel.llms import TransformersLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromHub, KeepColumns, LoadDataFromDicts
from distilabel.steps import Step, StepInput
from distilabel.steps.typing import StepOutput
from distilabel.steps.tasks import TextGeneration, SelfInstruct, Magpie
from typing import List
from pydantic import Field

In [4]:
HF_AUTH_TOKEN='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'
from huggingface_hub import login
login(token=HF_AUTH_TOKEN)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [43]:
# Defining Instruction Splitter class
class ListSplitter:

  split_column = 'instructions'

  def __init__(self, split_column) -> None:
     self.split_column = split_column

  def split_instructions_from_dataset(self, dataset: Dataset):
    new_rows = []
    for row in dataset:
      new_rows.extend(self.split_instructions_from_row(row))
    return new_rows

  def split_instructions_from_row(self, row):
      results = []
      for instruction in row[self.split_column]:
          result = row.copy()
          result['splitted'] = instruction
          del result[self.split_column]
          results.append(result)
      return results

In [45]:
class SplitList(Step):

    split_column: str = Field(..., description="The column containing list to split")

    @property
    def inputs(self) -> List[str]:
        # Specify the input fields expected by this step
        return [self.split_column]

    @property
    def outputs(self) -> List[str]:
        # Specify the output fields that this step will produce
        return ['splitted']

    def process(self, inputs: StepInput) -> StepOutput:
        inputs = ListSplitter(self.split_column).split_instructions_from_dataset(inputs)
        yield inputs

In [66]:
class ConversationToInstructPairs(Step):
    @property
    def inputs(self) -> List[str]:
        # Specify the input fields expected by this step
        return ['conversation']

    @property
    def outputs(self) -> List[str]:
        # Specify the output fields that this step will produce
        return ['instructions', 'responses']

    def process(self, inputs: StepInput) -> StepOutput:
        for example in inputs:
          instructions = []
          responses = []
          for talk in example['conversation']:
            if talk['role'] == 'user':
              instructions.append(talk['content'])
            else:
              responses.append(talk['content'])
          example['instructions'] = instructions
          example['responses'] = responses
          example.pop('conversation')
        yield inputs

In [1]:
data = [
    {
        "system_prompt": "You are a helpful Al assistant. The user will engage in a multi−round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and insightful responses to help the user with their queries."
    },
    {
        "system_prompt": "You are a helpful Al assistant. The user will engage in a multi−round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and insightful responses to help the user with their queries."
    },
]

In [72]:
with Pipeline(name="Question Generation") as pipeline:
    load_hub_dataset = LoadDataFromDicts(
        name="load_dataset",
        data=data
    )

    magpie = Magpie(
        llm=TransformersLLM(
            # model="unsloth/Phi-3.5-mini-instruct",
            model="Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct",
            magpie_pre_query_template="llama3",
            generation_kwargs={
                "temperature": 1.0,
                "max_new_tokens": 256,
            },
            device="cuda:0",
        ),
        # only_instruction=True,
        n_turns =2
    )

    instruct_res_pairs = ConversationToInstructPairs(
        name="conversation_to_instruct_pairs",
    )

    split_instructions = SplitList(
        name="split_instructions",
        split_column = 'instructions',
        output_mappings={"splitted": "instruction"}
    )

    split_responses = SplitList(
        name="split_responses",
        split_column = 'responses',
        output_mappings={"splitted": "response"}
    )

    keep_columns = KeepColumns(
        columns = ["instruction", "response"]
    )

    load_hub_dataset >> magpie >> instruct_res_pairs >> split_instructions >> split_responses >> keep_columns

In [73]:
distiset = pipeline.run(
    parameters={
        # load_hub_dataset.name: {
        #     "repo_id": "ahsanirfan961/synthetic-data-seeds",
        #     "split": "train",
        # },
    },
    # use_cache=False
)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


Generating train split: 0 examples [00:00, ? examples/s]

In [74]:
print(distiset)

Distiset({
    default: DatasetDict({
        train: Dataset({
            features: ['instruction', 'response'],
            num_rows: 8
        })
    })
})


In [75]:
print(distiset['default']['train'].to_pandas())

                                         instruction  \
0   To add more personality to your responses cho...   
1   To add more personality to your responses cho...   
2                                                      
3                                                      
4  \nBasicData Entry jobs in Sears Loyalty Soluti...   
5  \nBasicData Entry jobs in Sears Loyalty Soluti...   
6  \nThe Mental Talkoff from mentaltherapist.com ...   
7  \nThe Mental Talkoff from mentaltherapist.com ...   

                                            response  
0   If you need specific information about how to...  
1   You will have access to five AA Baden task ov...  
2   If you need specific information about how to...  
3   You will have access to five AA Baden task ov...  
4   If the user have any available documents, map...  
5   Teamwork Matters | Wellness Check Adis POS Sy...  
6   If the user have any available documents, map...  
7   Teamwork Matters | Wellness Check Adis POS Sy...  


In [34]:
print(distiset['default']['train'].to_pandas()['conversation'][0])

[{'content': " This task typically includes responding to multiple round questions by clarifying more vague or general inquiries from the source.\nBe comfortable sharing your knowledge and indicate areas where you may have researched.\nDemonstrate responsibility towards your work, maintaining strict confidentiality at all times, and using unique and original content when responding to the customer's inquiries using collected information related to the services that I assist.\nUtilize LinkedIn for quick yet detailed conversations with customer accounts.", 'role': 'user'}
 {'content': '', 'role': 'assistant'}
 {'content': " After each conversation, ensure to produce an audio recording to review and improve your performance. Welcome to This topic! To participate in this Al assistance role, you will… •\tAssist people by providing excellent customer service in multiple rounds of conversation while using numerous, important an• Ones Supports; graphic design specialists for free access to opp