In [1]:
!pip install vllm datasets
!git clone https://github.com/BatsResearch/bonito.git
!cd bonito && pip install -e .

Collecting vllm
  Downloading vllm-0.6.1.post2-cp38-abi3-manylinux1_x86_64.whl.metadata (2.4 kB)
Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting sentencepiece (from vllm)
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting tqdm (from vllm)
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting py-cpuinfo (from vllm)
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl.metadata (794 bytes)
Collecting transformers>=4.43.2 (from vllm)
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers>=0.19.1 (from vllm)
  Downloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinu

In [2]:
!pip install "distilabel[hf-transformers, openai]>=1.0.0"

Collecting distilabel>=1.0.0 (from distilabel[hf-transformers,openai]>=1.0.0)
  Downloading distilabel-1.3.2-py3-none-any.whl.metadata (13 kB)
Collecting nest-asyncio>=1.6.0 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading nest_asyncio-1.6.0-py3-none-any.whl.metadata (2.8 kB)
Collecting orjson>=3.10.0 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker>=2.8.2 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting rich>=13.5.0 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading rich-13.8.1-py3-none-any.whl.metadata (18 kB)
Collecting scipy>=1.10.0 (from distilabel>=1.0.0

In [29]:
from bonito import Bonito
from vllm import SamplingParams
from datasets import load_dataset, Dataset
from pydantic import Field
from distilabel.steps import Step, StepInput
from distilabel.steps.typing import StepOutput
from typing import List
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromHub, KeepColumns, LoadDataFromDicts

In [44]:
class GenerateBonito(Step):
    """Distilabel step that generates synthetic task examples with a Bonito model.

    For every batch of rows it builds a `datasets.Dataset`, passes it to
    `Bonito.generate_tasks` using the column named by `context_col` as context,
    and yields the generated rows as a list of dictionaries.

    The Bonito model is created lazily on the first `process` call, so
    constructing the step itself does not load any weights.
    """

    max_tokens: int = Field(..., description="The maximum number of tokens per generation")
    top_p: float = Field(..., description="The top_p value for sampling")
    temperature: float = Field(..., description="The temperature value for sampling")
    n: int = Field(..., description="The number of synthetic examples to generate")
    context_col: str = Field(..., description="The column name of the context in the input data")
    task_type: str = Field(..., description="The type of task to generate")
    model: str = Field(..., description="The model id to use for generation")

    def __init__(self, name, **data):
        """Create the step; the Bonito model itself is loaded later, on demand."""
        super().__init__(name=name, **data)
        # Placeholder for the lazily created Bonito instance (see `process`).
        self._bonito = None

    @property
    def inputs(self) -> List[str]:
        """Columns this step reads: the configured context column."""
        # Derived from `context_col` so the declared input always matches the
        # column that is actually handed to Bonito in `process`.
        return [self.context_col]

    @property
    def outputs(self) -> List[str]:
        """Columns this step produces."""
        return ['input', 'output']

    def process(self, inputs: StepInput) -> StepOutput:
        """Generate synthetic tasks for one batch of input rows.

        Args:
            inputs: Batch of rows (dictionaries) containing the `context_col` column.

        Yields:
            A list of dictionaries, one per row of the dataset returned by
            `Bonito.generate_tasks`.
        """
        # Load the model only once, the first time a batch arrives.
        if self._bonito is None:
            self._bonito = Bonito(self.model)

        sampling_params = SamplingParams(
            max_tokens=self.max_tokens,
            top_p=self.top_p,
            temperature=self.temperature,
            n=self.n,
        )

        input_dataset = Dataset.from_list(inputs)

        synthetic_dataset = self._bonito.generate_tasks(
            input_dataset,
            context_col=self.context_col,
            task_type=self.task_type,
            sampling_params=sampling_params,
        )
        # Per-batch summary of the generated dataset (columns and row count).
        print(synthetic_dataset)
        # `to_list()` returns the rows as plain Python dicts directly, without
        # the intermediate pandas DataFrame conversion.
        yield synthetic_dataset.to_list()

In [45]:
with Pipeline(name='bonito-pipeline') as pipeline:
    # Source step: the dataset's "text" column is exposed downstream as "input".
    # The repository id and split are supplied as runtime parameters at run time.
    load_data_from_hub = LoadDataFromHub(
        name='load-data-from-hub',
        output_mappings={"text": "input"},
    )

    # Generation step: one input row per batch, five sampled generations per row.
    synthesizer = GenerateBonito(
        name='synthesizer',
        input_batch_size=1,
        # Model and task configuration.
        model='BatsResearch/bonito-v1',
        task_type='qa',
        context_col='input',
        # Sampling configuration.
        n=5,
        temperature=0.7,
        top_p=0.5,
        max_tokens=512,
    )

    # Wire the loader's output into the synthesizer.
    load_data_from_hub >> synthesizer


In [46]:
# Runtime parameters for the loader step: which Hub dataset and split to read.
hub_runtime_params = {
    'repo_id': "ahsanirfan961/title-content-dataset",
    "split": "train",
}

# Parameters are keyed by step name; caching is disabled for this run.
distiset = pipeline.run(
    parameters={load_data_from_hub.name: hub_runtime_params},
    use_cache=False,
)

INFO 09-14 06:43:45 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='BatsResearch/bonito-v1', speculative_config=None, tokenizer='BatsResearch/bonito-v1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=BatsResearch/bonito-v1, use_v2_block_manager=False, num_scheduler_steps=1, enable_prefix_caching=False, use_async_output_

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 09-14 06:44:00 model_runner.py:997] Starting to load model BatsResearch/bonito-v1...
INFO 09-14 06:44:00 selector.py:240] Cannot use FlashAttention-2 backend due to sliding window.
INFO 09-14 06:44:00 selector.py:116] Using XFormers backend.
INFO 09-14 06:44:00 weight_utils.py:242] Using model weights format ['*.bin']


Loading pt checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


  state = torch.load(bin_file, map_location="cpu")


INFO 09-14 06:44:10 model_runner.py:1008] Loading model weights took 13.4966 GB
INFO 09-14 06:44:14 gpu_executor.py:122] # GPU blocks: 11426, # CPU blocks: 2048
INFO 09-14 06:44:18 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-14 06:44:18 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-14 06:44:32 model_runner.py:1430] Graph capturing finished in 14 secs.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it, est. speed input: 410.37 toks/s, output: 150.03 toks/s]


Filter:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output'],
    num_rows: 5
})


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.52s/it, est. speed input: 86.72 toks/s, output: 147.81 toks/s]


Filter:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output'],
    num_rows: 5
})


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.38s/it, est. speed input: 373.08 toks/s, output: 142.71 toks/s]


Filter:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output'],
    num_rows: 5
})


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.79s/it, est. speed input: 269.95 toks/s, output: 120.47 toks/s]


Filter:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output'],
    num_rows: 5
})


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.68s/it, est. speed input: 553.58 toks/s, output: 124.08 toks/s]


Filter:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output'],
    num_rows: 5
})


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.75s/it, est. speed input: 92.32 toks/s, output: 126.16 toks/s]


Filter:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output'],
    num_rows: 5
})


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.39s/it, est. speed input: 80.15 toks/s, output: 142.24 toks/s]


Filter:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output'],
    num_rows: 5
})


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.82s/it, est. speed input: 284.01 toks/s, output: 122.19 toks/s]


Filter:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output'],
    num_rows: 5
})


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.85s/it, est. speed input: 70.80 toks/s, output: 127.54 toks/s]


Filter:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output'],
    num_rows: 5
})


Generating train split: 0 examples [00:00, ? examples/s]

In [47]:
# Summarize the result of the pipeline run: configurations, splits, columns and row counts.
print(distiset)

Distiset({
    default: DatasetDict({
        train: Dataset({
            features: ['input', 'output'],
            num_rows: 45
        })
    })
})


In [48]:
# Convert the generated train split to a DataFrame and print it for inspection.
train_split = distiset['default']['train']
train_df = train_split.to_pandas()
print(train_df)

                                                input  \
0   a us navy transport plane carrying  people cra...   
1   a us navy transport plane carrying  people cra...   
2   a us navy transport plane carrying  people cra...   
3   a us navy transport plane carrying  people cra...   
4   a us navy transport plane carrying  people cra...   
5   zimbabwe s former vice president emmerson mnan...   
6   zimbabwe s former vice president emmerson mnan...   
7   zimbabwe s former vice president emmerson mnan...   
8   zimbabwe s former vice president emmerson mnan...   
9   zimbabwe s former vice president emmerson mnan...   
10  the mayor and police chief of cleveland next w...   
11  the mayor and police chief of cleveland next w...   
12  the mayor and police chief of cleveland next w...   
13  the mayor and police chief of cleveland next w...   
14  the mayor and police chief of cleveland next w...   
15  the ranking democrat on the us senate subcommi...   
16  the ranking democrat on the