In [1]:
!pip install "distilabel[hf-transformers, openai]>=1.0.0"

Collecting openai>=1.0.0 (from distilabel[hf-transformers,openai]>=1.0.0)
  Downloading openai-1.42.0-py3-none-any.whl.metadata (22 kB)
Collecting jiter<1,>=0.4.0 (from openai>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=2.0.0

In [2]:
from datasets import DatasetDict, Dataset
import pandas as pd
from distilabel.llms import TransformersLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromHub, KeepColumns, LoadDataFromDicts
from distilabel.steps import Step, StepInput
from distilabel.steps.typing import StepOutput
from distilabel.steps.tasks import TextGeneration, SelfInstruct
from typing import List
from pydantic import Field

In [3]:
HF_AUTH_TOKEN='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'
from huggingface_hub import login
login(token=HF_AUTH_TOKEN)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
criteria_for_query_generation = (
    "1. Relevance: Ensure the questions are directly related to the content and context of the input paragraph."
    "2. Diversity: Include a variety of question types such as factual, analytical, inferential, and evaluative."
    "3. Clarity: Make sure each question is clear, concise, and unambiguous."
    "4. Complexity: Incorporate questions of varying difficulty levels, from simple recall to complex analysis."
    "5. Coverage: Cover the entire content of the paragraph, addressing different sections and key points."
    "6. Specificity: Frame questions to be specific and pointed, encouraging precise answers."
    "7. Engagement: Create questions that are interesting and engaging, promoting thoughtful responses."
    "8. Open-endedness: A portion of the generated questions should encourage creative and thoughtful responses, rather than simple factual recall."
    "9. Output: Provide only the five user queries without any introductory or explanatory text."
)

application_description = "This AI assistant is designed to generate a series of relevant and thought-provoking questions based on the provided context or input. The goal is to generate questions that cover different aspects of the topic without providing answers. The goal is to create an AI that can simulate human-like understanding and reasoning to respond to any query effectively."

In [5]:
# Defining Instruction Splitter class
class InstructionSplitter:
  def split_instructions_from_dataset(self, dataset: Dataset):
    new_rows = []
    for row in dataset:
      new_rows.extend(self.split_instructions_from_row(row))
    return new_rows

  def split_instructions_from_row(self, row):
      results = []
      for instruction in row['instructions']:
          result = row.copy()
          result['instruction'] = instruction
          del result['instructions']
          results.append(result)
      return results

In [6]:
class SplitInstructions(Step):
    @property
    def inputs(self) -> List[str]:
        # Specify the input fields expected by this step
        return ['instructions']

    @property
    def outputs(self) -> List[str]:
        # Specify the output fields that this step will produce
        return ['instruction']

    def process(self, inputs: StepInput) -> StepOutput:
        inputs = InstructionSplitter().split_instructions_from_dataset(inputs)
        yield inputs

In [16]:
with Pipeline(name="Question Generation") as pipeline:
    load_hub_dataset = LoadDataFromHub(
        name="load_dataset",
        output_mappings={"prompt": "input"}
    )

    self_instruct = SelfInstruct(
        llm = TransformersLLM(model="Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct", device= "cuda:0"),
        input_batch_size=1,
        add_raw_output=False,
        num_instructions=5,
        criteria_for_query_generation=criteria_for_query_generation,
        application_description=application_description,
        output_mappings={"model_name": "instruction_model"},
    )

    split_instr = SplitInstructions(
        name="split_instructions_step"
    )

    answer_generation = TextGeneration(
        llm = TransformersLLM(model="Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct", device= "cuda:0"),
        input_batch_size=1,
        add_raw_output=False,
        output_mappings={"generation": "response", "model_name": "response_model"},
    )

    keep_columns = KeepColumns(
        columns = ["input", "instruction", "response", "instruction_model", "response_model"]
    )

    load_hub_dataset >> self_instruct >> split_instr >> answer_generation >> keep_columns

In [27]:
distiset = pipeline.run(
    parameters={
        load_hub_dataset.name: {
            "repo_id": "hassaan-qaisar/initial_prompt",
            "split": "train",
        },
        self_instruct.name: {
            "llm": {
                "generation_kwargs": {
                    "max_new_tokens": 256,
                    "temperature": 0.7,
                },
            },
        },
    },
)

In [18]:
print(distiset)

Distiset({
    default: DatasetDict({
        train: Dataset({
            features: ['input', 'instruction', 'response', 'instruction_model', 'response_model'],
            num_rows: 25
        })
    })
})


In [22]:
print(distiset['default']['train'].to_pandas())

                                                input  \
0   Renewable energy sources such as solar, wind, ...   
1   Gardening is a relaxing and rewarding hobby th...   
2   Gardening is a relaxing and rewarding hobby th...   
3   Gardening is a relaxing and rewarding hobby th...   
4   Gardening is a relaxing and rewarding hobby th...   
5   Gardening is a relaxing and rewarding hobby th...   
6   Gardening is a relaxing and rewarding hobby th...   
7   Gardening is a relaxing and rewarding hobby th...   
8   Gardening is a relaxing and rewarding hobby th...   
9   Gardening is a relaxing and rewarding hobby th...   
10  Gardening is a relaxing and rewarding hobby th...   
11  We basically want to make the safe use of an a...   
12  We basically want to make the safe use of an a...   
13  We basically want to make the safe use of an a...   
14  We basically want to make the safe use of an a...   
15  We basically want to make the safe use of an a...   
16  We basically want to make t