# Instruction Synthesizer Technique

In [1]:
!pip install "distilabel[hf-transformers, openai]>=1.0.0"

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromHub, KeepColumns, LoadDataFromDicts
from distilabel.steps.tasks import Genstruct
from distilabel.llms import TransformersLLM
from distilabel.steps import Step, StepInput
from distilabel.steps.typing import StepOutput
from typing import List
from pydantic import Field, PrivateAttr
from transformers import AutoModelForCausalLM, AutoTokenizer

In [3]:

def parse_pred(pred):
    """Extract the list of instruction-response pairs from the prediction"""
    QA_str_list = pred.split('</END>')
    if not pred.endswith('</END>'):
        QA_str_list = QA_str_list[:-1]

    QA_list = []
    raw_questions = []
    for QA_str in QA_str_list:
        try:
            assert len(QA_str.split('<ANS>')) == 2, f'invalid QA string: {QA_str}'
            Q_str, A_str = QA_str.split('<ANS>')
            Q_str, A_str = Q_str.strip(), A_str.strip()
            assert Q_str.startswith('<QUE>'), f'invalid question string: {Q_str} in QA_str: {QA_str}'
            assert len(A_str) > 0, f'invalid answer string in QA_str: {QA_str}'
            Q_str = Q_str.replace('<QUE>', '').strip()
            assert Q_str.lower() not in raw_questions, f'duplicate question: {Q_str}'
            QA_list.append({'Q': Q_str, 'A': A_str})
            raw_questions.append(Q_str.lower())
        except:
            pass

    return QA_list

def get_instruction_response_pairs(context, model, tokenizer):
    '''Prompt the synthesizer to generate instruction-response pairs based on the given context'''
    prompt = f'<s> <CON> {context} </CON>\n\n'
    inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(input_ids=inputs, max_new_tokens=400, do_sample=False)[0]

    pred_start = int(inputs.shape[-1])
    pred = tokenizer.decode(outputs[pred_start:], skip_special_tokens=True)
    return parse_pred(pred)

In [4]:
# # Get the generated instruction-response paris
# instruction_response_pairs = get_instruction_response_pairs(context)

# # Print out the results
# print(f'# Context:\n{context}\n')
# for index, pair in enumerate(instruction_response_pairs):
#     print(f'## Instruction {index + 1}:\n{pair["Q"]}\n## Response {index + 1}:\n{pair["A"]}\n')

In [15]:
from transformers.models.mistral.modeling_mistral import MistralForCausalLM
from transformers.models.llama.tokenization_llama_fast import LlamaTokenizerFast

class InstructionSynthesizer(Step):

    def __init__(self, name: str):
        super().__init__(name=name)
        self._model = None
        self._tokenizer = None
    
    @property
    def inputs(self) -> List[str]:
        # Specify the input fields expected by this step
        return ['text']

    @property
    def outputs(self) -> List[str]:
        # Specify the output fields that this step will produce
        return ['instruction', 'response']

    def process(self, inputs: StepInput) -> StepOutput:
        if self._model is None:
            self._model = AutoModelForCausalLM.from_pretrained("instruction-pretrain/instruction-synthesizer").to('cuda')
            self._tokenizer = AutoTokenizer.from_pretrained("instruction-pretrain/instruction-synthesizer")
        results = []
        for example in inputs:
            context = example['text']
            instruction_response_pairs = get_instruction_response_pairs(context, self._model, self._tokenizer)
            print(instruction_response_pairs)
            text_index = inputs.index(example)
            for pair in instruction_response_pairs:
                results.append({"text": context, "instruction": pair["Q"], "response": pair["A"]})
        yield results

In [18]:
with Pipeline(name='synthesizer-pipeline') as pipeline:
    load_data_from_hub = LoadDataFromHub(
        name='load-data-from-hub',
        # output_mappings={'prompt': 'text'}
    )

    synthesizer = InstructionSynthesizer(
        name='synthesizer',
        input_batch_size = 1
    )
    
    load_data_from_hub >> synthesizer 


In [19]:
distiset = pipeline.run(
    parameters={
        load_data_from_hub.name: {
            'repo_id': "ahsanirfan961/title-content-dataset",
            "split": "train"
        },
    },
)

README.md:   0%|          | 0.00/299 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[{'Q': 'where did the us navy transport plane crash?', 'A': 'philippine sea south of japan'}, {'Q': 'how many people are missing?', 'A': 'three'}, {'Q': 'how many people were rescued?', 'A': 'eight'}, {'Q': 'what will happen to the missing people?', 'A': 'search and rescue efforts continue'}, {'Q': 'what is the name of the us navy transport plane?', 'A': 'greyhound'}, {'Q': 'how many crashes have there been this year?', 'A': 'two'}, {'Q': 'how many people were rescued and transferred to the carrier?', 'A': 'eight'}, {'Q': 'what is the number of people missing?', 'A': 'three'}]


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'Q': "when is he due to be sworn in?\nLet's think step by step.", 'A': 'zimbabwe s former vice president emmerson mnangagwa arrived back in the country on wednesday two days before he is due to be sworn in as president to replace robert mugabe ruling party zanupf official larry mavhima said mugabe resigned as zimbabwe s president on tuesday a week after the army and his former political allies moved to end four decades of rule by a man once feted as an independence hero who became feared as a despot\nTherefore, the answer is wednesday'}]


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'Q': 'what is cleveland preparing for', 'A': 'the republican national convention'}, {'Q': 'when is the convention', 'A': 'july'}, {'Q': 'what has been criticized', 'A': 'aspects of the preparations'}, {'Q': 'who has criticized the preparations', 'A': 'a civil rights group and a police union'}, {'Q': 'who is the police chief', 'A': 'calvin williams'}, {'Q': 'what will the police withhold', 'A': 'certain information'}, {'Q': 'why will they withhold information', 'A': 'to stymie anyone seeking to disrupt the convention'}, {'Q': 'how many officers will be assigned', 'A': 'unknown'}, {'Q': 'what did the mayor say', 'A': 'comprehensive overview of security preparations'}, {'Q': 'what has been accused', 'A': 'the city of taking too long to issue permits to protesters planning demonstrations at the convention'}, {'Q': 'how many demonstrations have there been', 'A': 'intense'}, {'Q': 'when are the demonstrations', 'A': 'at campaign stops'}, {'Q': 'what happened at the demonstrations', 'A': 'r

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'Q': 'what does a federal law call for?', 'A': 'calls on states to require that healthcare workers notify child protection services when a baby is born affected by illegal substance abuse or has symptoms of drug withdrawal such reports are not to be used as evidence of abuse the law says but rather to help develop a “plan of safe care” for the newborns after they leave the hospital'}, {'Q': 'how many states are not following this law?', 'A': 'reuters found that no more than nine states and the district of columbia appear to follow the law'}, {'Q': 'what does casey want?', 'A': 'a congressional watchdog agency to investigate whether states are complying with a federal law meant to protect newborns in drug withdrawal and help their families'}, {'Q': 'who is robert casey?', 'A': 'senator'}, {'Q': 'from what state?', 'A': 'pennsylvania'}, {'Q': 'what does the law say?', 'A': 'federal law calls on states to require that healthcare workers notify child protection services when a baby is bo

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'Q': "what is the foundation's opinion regarding states' financial strain?", 'A': 'the kaiser family foundation estimates a repeal of obamacare and a cap on federal medicaid spending such as through a block grant or a per capita cap could cut medicaid funding by  percent over the next decade that would likely handicap states’ ability to respond to larger enrollments during recessions'}, {'Q': 'what is the reason for the potential repeal of obamacare?', 'A': 'president donald trump’s push to fulfill a campaign promise to replace obamacare his predecessor’s signature healthcare plan with the help of a republicancontrolled congress'}, {'Q': 'why is there a push to replace obamacare?', 'A': 'obamacare formally known as the affordable care act graphic  here while republicans have not agreed to specific plans one idea gaining traction has been to convert the current system in which states share the cost of medicaid enrollees with the federal government into fixed payments or block grants s

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'Q': 'how many people were detained?', 'A': 'two dozen'}, {'Q': 'how many people remain in detention?', 'A': 'a couple dozen'}, {'Q': 'what is the number of people that remain in detention?', 'A': 'two dozen'}, {'Q': 'how many people were detained when they entered the united states on saturday?', 'A': 'two dozen'}]


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'Q': 'who did donald trump invite?', 'A': 'senate democratic leader chuck schumer and house democratic leader nancy pelosi'}, {'Q': 'who did president donald trump invite to dine with him on wednesday?', 'A': 'senate democratic leader chuck schumer and house democratic leader nancy pelosi'}, {'Q': 'what is the name of the act that would protect youth brought to the united states illegally?', 'A': 'dream act'}]


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'Q': 'who called the move a dangerous escalation?', 'A': "qatar's foreign ministry"}, {'Q': 'what did sheikh mohammed bin abdulrahman althani call the move?', 'A': 'a dangerous escalation'}]


Generating train split: 0 examples [00:00, ? examples/s]

In [20]:
print(distiset)

Distiset({
    default: DatasetDict({
        train: Dataset({
            features: ['text', 'instruction', 'response'],
            num_rows: 46
        })
    })
})


In [21]:
print(distiset['default']['train'].to_pandas())

                                                 text  \
0   a us navy transport plane carrying  people cra...   
1   a us navy transport plane carrying  people cra...   
2   a us navy transport plane carrying  people cra...   
3   a us navy transport plane carrying  people cra...   
4   a us navy transport plane carrying  people cra...   
5   a us navy transport plane carrying  people cra...   
6   a us navy transport plane carrying  people cra...   
7   a us navy transport plane carrying  people cra...   
8   zimbabwe s former vice president emmerson mnan...   
9   the mayor and police chief of cleveland next w...   
10  the mayor and police chief of cleveland next w...   
11  the mayor and police chief of cleveland next w...   
12  the mayor and police chief of cleveland next w...   
13  the mayor and police chief of cleveland next w...   
14  the mayor and police chief of cleveland next w...   
15  the mayor and police chief of cleveland next w...   
16  the mayor and police chief 

In [13]:
print(distiset['default']['train'].to_pandas()['instruction'][0])

What type of energy does the author likely advocate for?
Options:
- wind energy
- solar energy
- no energy
- hydroelectric energy
- fossil fuels
- kinetic energy
- nuclear energy
- coal energy
Let's think step by step.


In [14]:
print(distiset['default']['train'].to_pandas()['response'][0])

Renewable energy sources such as solar, wind, and hydroelectric power are becoming increasingly important in the fight against climate change. These energy sources produce little to no greenhouse gas emissions, making them environmentally friendly alternatives to fossil fuels.
Therefore, the answer is solar energy
