In [1]:
from pydantic import Field
from distilabel.steps import Step, StepInput
from distilabel.steps.typing import StepOutput
from typing import List
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromHub, KeepColumns, LoadDataFromDicts
from distilabel.steps.tasks import TextGeneration
from distilabel.llms import TransformersLLM
import pandas as pd
import random

In [None]:
class SelectRandomValues(Step):
    """Collapse a batch of rows into one row of independently sampled words.

    For every incoming batch, one noun, one adjective and one verb are each
    drawn at random from the corresponding column of the batch. The three
    draws are independent, so the resulting row may mix values that came
    from different input rows.
    """

    @property
    def inputs(self) -> List[str]:
        """Columns that must be present on every incoming row."""
        return ['noun', 'adjective', 'verb']

    @property
    def outputs(self) -> List[str]:
        """Columns present on the single row emitted per batch."""
        return ['noun', 'adjective', 'verb']

    def process(self, inputs: StepInput) -> StepOutput:
        """Yield a one-row batch with a random noun, adjective and verb.

        Args:
            inputs: The batch of rows, each row being a dict of column values.

        Yields:
            A list containing exactly one dict with the keys ``noun``,
            ``adjective`` and ``verb``.
        """
        # Transpose the list of row dicts into {column name: list of values}.
        columns = pd.DataFrame(inputs).to_dict(orient='list')

        # Draw in a fixed order (noun, adjective, verb) so the sequence of
        # calls into the random generator stays predictable.
        sampled_row = {
            field: random.choice(columns[field])
            for field in ('noun', 'adjective', 'verb')
        }
        yield [sampled_row]

In [2]:
class GeneratePrompts(Step):
    """Build an ``instruction`` column by filling a template with row values.

    Every ``{column_name}`` placeholder found in ``template`` is replaced by
    the value that the row holds for that column. Placeholders that do not
    match any column of the row are left untouched.
    """

    template: str = Field(..., description="The template to use for generating prompts.")

    @property
    def inputs(self) -> List[str]:
        """No column is strictly required; any column may act as a placeholder source."""
        return []

    @property
    def outputs(self) -> List[str]:
        """The step adds a single ``instruction`` column."""
        return ['instruction']

    def process(self, inputs: StepInput) -> StepOutput:
        """Render the template for each row and store it under ``instruction``.

        Args:
            inputs: The batch of rows, each row being a dict of column values.

        Yields:
            The same batch, with ``instruction`` set on every row.
        """
        for example in inputs:
            instruction = self.template
            # Iterate over a snapshot so that writing 'instruction' below can
            # never interfere with the iteration.
            for column, value in list(example.items()):
                if column == 'instruction':
                    # The output column is being (re)built here; it is not a
                    # source of placeholder values.
                    continue
                # Build the literal placeholder text, e.g. '{noun}'. Values
                # are coerced to str because str.replace rejects non-strings.
                placeholder = '{' + str(column) + '}'
                instruction = instruction.replace(placeholder, str(value))
            example['instruction'] = instruction

        yield inputs

In [3]:
# Assemble the story-generation pipeline: load rows from the Hub, sample one
# noun/adjective/verb per batch, turn them into a prompt, then generate a story.
with Pipeline(name='tiny-stories-pipeline') as tiny_pipeline:
    # The dataset repository and split are supplied as runtime parameters
    # when the pipeline is run.
    load_data_from_hub = LoadDataFromHub(
        name='load-data-from-hub',
    )

    select_random_values = SelectRandomValues(
        name='select-random-values'
    )

    # The {verb}, {noun} and {adjective} placeholders are filled per row by
    # GeneratePrompts.
    generate_prompts = GeneratePrompts(
        name='generate-prompts',
        template="""
            Write a short story (3-5 paragraphs) which only uses very simple words that a 3 year old child would likely understand. The story should use the verb "{verb}", the noun "{noun}" and the adjective "{adjective}". The story should have the following features: the story should contain at least one dialogue, the story has a bad ending. Remember to only use simple words!
        """
    )

    # Text generation with a local Transformers model configured for the first
    # CUDA device; the task's `generation` output is mapped to a `story` column.
    synthesizer = TextGeneration(
        name='story-generation',
        llm=TransformersLLM(
            model="microsoft/Phi-3.5-mini-instruct",
            device="cuda:0",
        ),
        output_mappings={"generation": "story"}
    )

    # Wire the steps into a linear chain.
    load_data_from_hub >> select_random_values >> generate_prompts >> synthesizer


In [None]:
# Run the pipeline with caching disabled. Runtime parameters are keyed by
# step name.
distiset = tiny_pipeline.run(
    use_cache=False,
    parameters={
        # Source dataset for the loader step.
        load_data_from_hub.name: {"repo_id": "ahsanirfan961/noun-adj-verb", "split": "train"},
        # Generation settings are currently disabled. Example of how they
        # would be passed for the story generation step:
        # synthesizer.name: {
        #     "llm": {
        #         "generation_kwargs": {"max_new_tokens": 1024, "temperature": 0.7},
        #     },
        # }
    },
)

In [None]:
# Show a summary of the object returned by the pipeline run.
print(distiset)
# Show the rows of the 'train' split of the 'default' configuration as a pandas DataFrame.
print(distiset['default']['train'].to_pandas())