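# Genstruct Technique

This notebook builds a distilabel pipeline that loads title/content pairs from a Hugging Face Hub dataset, passes them to the `NousResearch/Genstruct-7B` model through the `Genstruct` task, and keeps the generated instruction/response pairs alongside the source columns.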

In [1]:
!pip install "distilabel[hf-transformers, openai]>=1.0.0"

Collecting distilabel>=1.0.0 (from distilabel[hf-transformers,openai]>=1.0.0)
  Downloading distilabel-1.3.2-py3-none-any.whl.metadata (13 kB)
Collecting datasets>=2.16.0 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting httpx>=0.25.2 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting multiprocess>=0.70 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting nest-asyncio>=1.6.0 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading nest_asyncio-1.6.0-py3-none-any.whl.metadata (2.8 kB)
Collecting orjson>=3.10.0 (from distilabel>=1.0.0->distilabel[hf-transformers,openai]>=1.0.0)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K   

In [2]:
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromHub, KeepColumns, LoadDataFromDicts
from distilabel.steps.tasks import Genstruct
from distilabel.llms import TransformersLLM

In [5]:
# Three-step pipeline: load rows from the Hub -> generate with Genstruct -> trim columns.
with Pipeline(name="genstruct-pipeline") as pipeline:
    # Source step; the Hub repo and split are passed in later through
    # pipeline.run(parameters=...), keyed by this step's name.
    load_data_from_hub = LoadDataFromHub(
        name="load-data-from-hub",
        # Expose the dataset's `text` column as `content`; `title` keeps its name.
        output_mappings={"title": "title", "text": "content"},
    )

    # Genstruct-7B loaded through transformers and placed on device `cuda:0`.
    genstruct_llm = TransformersLLM(
        model="NousResearch/Genstruct-7B",
        device="cuda:0",
    )

    # Rename the task's `user` / `assistant` outputs to `instruction` / `response`.
    genstruct = Genstruct(
        llm=genstruct_llm,
        output_mappings={"user": "instruction", "assistant": "response"},
    )

    # Only these four columns are kept in the final dataset.
    keep_columns = KeepColumns(
        columns=["title", "content", "instruction", "response"],
    )

    # Wire the steps one edge at a time: loader -> generator -> column filter.
    load_data_from_hub >> genstruct
    genstruct >> keep_columns


In [7]:
# Runtime parameters for the loader step: which Hub dataset and split to read.
loader_params = {
    "repo_id": "ahsanirfan961/title-content-dataset",
    "split": "train",
}

# Runtime parameters for the Genstruct step: generation settings forwarded to its LLM.
genstruct_params = {
    "llm": {
        "generation_kwargs": {
            "max_new_tokens": 256,
            "temperature": 0.7,
        },
    },
}

# Parameters are keyed by step name, so each dict reaches the matching step.
run_parameters = {
    load_data_from_hub.name: loader_params,
    genstruct.name: genstruct_params,
}

distiset = pipeline.run(parameters=run_parameters)

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
# Print a summary of the run result: its configurations, splits, column names and row count.
print(distiset)

Distiset({
    default: DatasetDict({
        train: Dataset({
            features: ['title', 'content', 'instruction', 'response'],
            num_rows: 9
        })
    })
})


In [9]:
# End the cell with the DataFrame itself so Jupyter renders it as a table.
# print() would force the plain-text repr, which splits the columns into
# several backslash-continued chunks that are hard to read side by side.
distiset['default']['train'].to_pandas()

                                               title  \
0  U.S. Navy plane crashes in Philippine Sea, thr...   
1  Zimbabwe's Mnangagwa arrives home, to be sworn...   
2  Cleveland officials to outline security for Re...   
3  U.S. senator calls for GAO probe to protect ba...   
4  Republican ideas for healthcare reforms could ...   
5  White House says 'couple of dozen' people stil...   
6  Democratic leaders Schumer, Pelosi to dine wit...   
7  Famine survey warns of thousands dying daily i...   
8  Qatar calls Trump's Jerusalem move 'death sent...   

                                             content  \
0  a us navy transport plane carrying  people cra...   
1  zimbabwe s former vice president emmerson mnan...   
2  the mayor and police chief of cleveland next w...   
3  the ranking democrat on the us senate subcommi...   
4  president donald trump’s push to fulfill a cam...   
5  the white house on sunday defended its impleme...   
6  us president donald trump invited senate dem