In [1]:
!pip install huggingface_hub pydantic dataset datasets



In [None]:
!pip install "distilabel[hf-transformers, openai]>=1.0.0"

Collecting distilabel>=1.0.0 (from distilabel[hf-transformers,openai]>=1.0.0)
  Downloading distilabel-1.3.2-py3-none-any.whl.metadata (13 kB)
Collecting httpx>=0.25.2 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson>=3.10.0 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker>=2.8.2 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting universal-pathlib>=0.2.2 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading universal_pathlib-0.2.5-py3-none-any.whl.metadata (25 kB)
Collecting openai>=1.0.0 (from disti

In [3]:
pip install distilabel[outlines]

Collecting outlines>=0.0.40 (from distilabel[outlines])
  Downloading outlines-0.0.46-py3-none-any.whl.metadata (15 kB)
Collecting interegular (from outlines>=0.0.40->distilabel[outlines])
  Downloading interegular-0.3.3-py37-none-any.whl.metadata (3.0 kB)
Collecting lark (from outlines>=0.0.40->distilabel[outlines])
  Downloading lark-1.2.2-py3-none-any.whl.metadata (1.8 kB)
Collecting diskcache (from outlines>=0.0.40->distilabel[outlines])
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting pycountry (from outlines>=0.0.40->distilabel[outlines])
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Collecting pyairports (from outlines>=0.0.40->distilabel[outlines])
  Downloading pyairports-2.1.1-py3-none-any.whl.metadata (1.7 kB)
Downloading outlines-0.0.46-py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.9/101.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading diskcache-5.6.3-py3-none-any.whl

In [5]:
from huggingface_hub import get_token
from huggingface_hub import InferenceClient
from datasets import load_dataset
from toolz import take
from datasets import get_dataset_config_names
import torch, json
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from pydantic import Field
from distilabel.steps import Step, StepInput
from distilabel.steps.typing import StepOutput
from typing import List
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromHub, KeepColumns, LoadDataFromDicts
from distilabel.steps.tasks import TextGeneration
from distilabel.llms import TransformersLLM

In [6]:
class GeneratePrompts(Step):

    @property
    def inputs(self) -> List[str]:
        # Specify the input fields expected by this step
        return ['text']

    @property
    def outputs(self) -> List[str]:
        # Specify the output fields that this step will produce
        return ['text', 'instruction']

    def process(self, inputs: StepInput) -> StepOutput:

        for example in inputs:
          instruction = f"""
            Let's write abstract descriptions of sentences. Example:
            Sentence: Pilate's role in the events leading to the crucifixion lent themselves to melodrama , even tragedy , and Pilate often has a role in medieval mystery plays .
            Description: A description of a historical religious figure's involvement in a significant event and its later portrayal in art.
            Note: Descriptions can differ in the level of abstraction, granularity and the part of the sentence they focus on. Some descriptions need to be abstract, while others should be concrete and detailed.
            For the following sentence, write up 5 good and stand-alone, independent descriptions and 5 bad descriptions (which may be related, but are clearly wrong). Output a json file with keys 'good', 'bad'.
            Sentence: {example['text']}
            Start your answer with a curly bracket.
            """
          example['instruction'] = instruction

        yield inputs

In [24]:
class ParseDescriptions(Step):

    @property
    def inputs(self) -> List[str]:
        # Specify the input fields expected by this step
        return ['descriptions']

    @property
    def outputs(self) -> List[str]:
        # Specify the output fields that this step will produce
        return ['good_description', "bad_description"]

    def process(self, inputs: StepInput) -> StepOutput:

        new_generations = []
        for example in inputs:
          try:
            parsed = json.loads(example['descriptions'])
            for good, bad in zip(parsed['good'], parsed['bad']):
              new_generations.append({
                'text': example['text'],
                'instruction': example['instruction'],
                'good_description': good,
                'bad_description': bad
              })
          except:
            print("Can't parse descriptions")
            print(example['descriptions'])

        yield inputs

In [4]:
from pydantic import BaseModel

class Description(BaseModel):
    good: list[str]
    bad: list[str]

In [25]:
with Pipeline(name='sentence-similarity-pipeline') as synthesis_pipeline:
    load_data_from_hub = LoadDataFromHub(
        name='load-data-from-hub',
    )

    generate_prompts = GeneratePrompts(
        name='generate-prompts',
    )

    synthesizer = TextGeneration(
        name='description-text-genration',
        llm = TransformersLLM(
            model="microsoft/Phi-3.5-mini-instruct",
            device="cuda:0",
            structured_output={"format": "json", "schema": Description}
        ),
        output_mappings={"generation": "descriptions"}
    )

    parse_descriptions = ParseDescriptions(
        name='parse-descriptions',
    )

    load_data_from_hub >> generate_prompts >> synthesizer >> parse_descriptions


In [26]:
distiset = synthesis_pipeline.run(
    parameters={
        load_data_from_hub.name: {
            'repo_id': "ahsanirfan961/title-content-dataset",
            "split": "train"
        },
        synthesizer.name: {
            "llm": {
                "generation_kwargs": {
                    "max_new_tokens": 1024,
                    "temperature": 0.7,
                },
            },
        }
    },
    use_cache=False
)

  self.pid = os.fork()


  return [self.format_input(input) for input in inputs]


You are not running the flash-attention implementation, expect numerical differences.


Generating train split: 0 examples [00:00, ? examples/s]

In [27]:
print(distiset)

Distiset({
    default: DatasetDict({
        train: Dataset({
            features: ['title', 'text', 'instruction', 'descriptions', 'distilabel_metadata', 'model_name', 'good_description', 'bad_description'],
            num_rows: 9
        })
    })
})


In [28]:
print(distiset['default']['train'].to_pandas())

                                               title  \
0  U.S. Navy plane crashes in Philippine Sea, thr...   
1  Zimbabwe's Mnangagwa arrives home, to be sworn...   
2  Cleveland officials to outline security for Re...   
3  U.S. senator calls for GAO probe to protect ba...   
4  Republican ideas for healthcare reforms could ...   
5  White House says 'couple of dozen' people stil...   
6  Democratic leaders Schumer, Pelosi to dine wit...   
7  Famine survey warns of thousands dying daily i...   
8  Qatar calls Trump's Jerusalem move 'death sent...   

                                                text  \
0  a us navy transport plane carrying  people cra...   
1  zimbabwe s former vice president emmerson mnan...   
2  the mayor and police chief of cleveland next w...   
3  the ranking democrat on the us senate subcommi...   
4  president donald trump’s push to fulfill a cam...   
5  the white house on sunday defended its impleme...   
6  us president donald trump invited senate dem

In [23]:
print(distiset['default']['train'].to_pandas()['generation'][3])

{ "good": [ "A political representative seeks oversight into how well US states adhere to laws aimed at safeguarding infants exposed to drugs via maternal use.", ", An inquiry regarding improved implementation measures within Congressional framework concerning infant safety amidst parental narcotic misuse cases." ], "bad": ["An analysis detailing economic impact studies linked to urban development policies.", "Description focused solely on culinary advancements introduced between two distinct geopolitical entities."] }
