Add `DeitaFiltering` step #481

gabrielmbmb · 2024-03-25T17:39:47Z

Description

This PR adds the DeitaFiltering step and introduces parameters in EvolInstruct and EvolQuality that allow including the original instruction or response in the list of evolved_instructions or evolved_responses, respectively. In addition, it adds the auxiliary ExpandColumns and GenerateConversation steps that were needed to build the whole DEITA pipeline.

Pipeline

from distilabel.llm.huggingface.transformers import TransformersLLM
from distilabel.llm.openai import OpenAILLM
from distilabel.pipeline.local import Pipeline
from distilabel.steps.conversation import ConversationTemplate
from distilabel.steps.deita import DeitaFiltering
from distilabel.steps.expand import ExpandColumns
from distilabel.steps.generators.huggingface import LoadHubDataset
from distilabel.steps.task.complexity_scorer import ComplexityScorer
from distilabel.steps.task.evol_instruct.base import EvolInstruct
from distilabel.steps.task.evol_quality.base import EvolQuality
from distilabel.steps.task.generate_embeddings import GenerateEmbeddings
from distilabel.steps.task.quality_scorer import QualityScorer
from distilabel.steps.globals.huggingface import PushToHub


# Build the DEITA pipeline. Steps are declared inside the `Pipeline` context
# and wired together with `.connect()` at the bottom of the block. The flow is:
# load data -> evolve instructions -> score complexity -> expand rows ->
# evolve responses -> score quality -> expand rows -> build conversation ->
# embed -> DEITA filtering, with two side branches that push intermediate
# results to the Hub.
with Pipeline(name="DEITA", description="") as pipeline:
    # Source dataset. The repo id and split are supplied at run time through the
    # `parameters` passed to `pipeline.run`. The `prompt` column is exposed as
    # `instruction` (presumably a rename of the step output -- confirm against
    # the `output_mappings` semantics).
    load_data = LoadHubDataset(
        name="load_data", batch_size=100, output_mappings={"prompt": "instruction"}
    )

    # Evolve each instruction 5 times, keeping every evolution and generating an
    # answer for them. `include_original_instruction=True` is the option added
    # in this PR so the original instruction is kept in the evolved list.
    evol_instruction_complexity = EvolInstruct(
        name="evol_instruction_complexity",
        llm=OpenAILLM(model="gpt-3.5-turbo"),
        num_evolutions=5,
        store_evolutions=True,
        generate_answers=True,
        include_original_instruction=True,
    )

    # Score the evolved instructions; the scorer's `instructions` input is fed
    # from the `evolved_instructions` column.
    instruction_complexity_scorer = ComplexityScorer(
        name="instruction_complexity_scorer",
        llm=OpenAILLM(model="gpt-3.5-turbo"),
        input_mappings={"instructions": "evolved_instructions"},
    )

    # Expand the three list-valued columns (presumably one output row per list
    # element -- confirm against ExpandColumns) and give them singular names.
    # `scores` becomes `evol_instruction_score`; the quality scorer output
    # further down is also expanded from a `scores` column.
    expand_evolved_instructions = ExpandColumns(
        name="expand_evolved_instructions",
        columns=["evolved_instructions", "answers", "scores"],
        output_mappings={
            "evolved_instructions": "evolved_instruction",
            "answers": "answer",
            "scores": "evol_instruction_score",
        },
    )

    # Evolve the answer of each (evolved instruction, answer) pair 5 times.
    # `include_original_response=True` is the EvolQuality counterpart of the
    # option added in this PR for EvolInstruct.
    evol_response_quality = EvolQuality(
        name="evol_response_quality",
        llm=OpenAILLM(model="gpt-3.5-turbo"),
        num_evolutions=5,
        store_evolutions=True,
        include_original_response=True,
        input_mappings={
            "instruction": "evolved_instruction",
            "response": "answer",
        },
    )

    # Score the evolved responses for their evolved instruction.
    response_quality_scorer = QualityScorer(
        name="response_quality_scorer",
        llm=OpenAILLM(model="gpt-3.5-turbo"),
        input_mappings={
            "instruction": "evolved_instruction",
            "responses": "evolved_responses",
        },
    )

    # Expand the evolved responses and their scores, renaming `scores` to
    # `evol_response_score`.
    expand_evolved_responses = ExpandColumns(
        name="expand_evolved_responses",
        columns=["evolved_responses", "scores"],
        output_mappings={
            "evolved_responses": "evolved_response",
            "scores": "evol_response_score",
        },
    )

    # Build a conversation from each evolved instruction/response pair. The
    # output is consumed below as the `conversation` column.
    generate_conversation = ConversationTemplate(
        name="generate_conversation",
        input_mappings={
            "instruction": "evolved_instruction",
            "response": "evolved_response",
        },
    )

    # Side branch: push the data as it is after the conversation step. The
    # target repo id is a runtime parameter.
    push_to_hub_after_conversation = PushToHub(
        name="push_to_hub_after_conversation",
    )

    # Embed each conversation with a local Transformers model on GPU in float16.
    generate_embeddings = GenerateEmbeddings(
        name="generate_embeddings",
        llm=TransformersLLM(
            model="argilla/notus-7b-v1",
            device="cuda",
            torch_dtype="float16",
        ),
        input_mappings={"text": "conversation"},
        input_batch_size=5,
    )

    # Side branch: push the data as it is after the embeddings step.
    push_to_hub_after_embeddings = PushToHub(
        name="push_to_hub_after_embeddings",
    )

    # Final step; `data_budget` and `diversity_threshold` are runtime parameters.
    deita_filtering = DeitaFiltering(name="deita_filtering")

    # Wire the graph. The main path is linear. `generate_conversation` and
    # `generate_embeddings` each fan out to the next main step and to their own
    # PushToHub side branch.
    load_data.connect(evol_instruction_complexity)
    evol_instruction_complexity.connect(instruction_complexity_scorer)
    instruction_complexity_scorer.connect(expand_evolved_instructions)
    expand_evolved_instructions.connect(evol_response_quality)
    evol_response_quality.connect(response_quality_scorer)
    response_quality_scorer.connect(expand_evolved_responses)
    expand_evolved_responses.connect(generate_conversation)
    generate_conversation.connect(generate_embeddings)
    generate_conversation.connect(push_to_hub_after_conversation)
    generate_embeddings.connect(deita_filtering)
    generate_embeddings.connect(push_to_hub_after_embeddings)

# Script entry point: run the pipeline with per-step runtime parameters and
# push the resulting distiset to the Hub.
if __name__ == "__main__":
    distiset = pipeline.run(
        # Keys are the `name=` values given to the steps when they were declared.
        parameters={
            "load_data": {
                "repo_id": "HuggingFaceH4/instruction-dataset",
                "split": "test",
            },
            # Evolution steps sample with temperature 0.7 and up to 512 new tokens.
            "evol_instruction_complexity": {
                "llm": {
                    "generation_kwargs": {"max_new_tokens": 512, "temperature": 0.7}
                }
            },
            # Scoring steps use temperature 0.0.
            "instruction_complexity_scorer": {
                "llm": {"generation_kwargs": {"temperature": 0.0}}
            },
            "evol_response_quality": {
                "llm": {
                    "generation_kwargs": {"max_new_tokens": 512, "temperature": 0.7}
                }
            },
            "response_quality_scorer": {
                "llm": {"generation_kwargs": {"temperature": 0.0}}
            },
            # Target repositories for the two intermediate PushToHub steps.
            "push_to_hub_after_conversation": {
                "repo_id": "distilabel-internal-testing/deita-after-conversation",
            },
            "push_to_hub_after_embeddings": {
                "repo_id": "distilabel-internal-testing/deita-after-embeddings",
            },
            # NOTE(review): presumably the number of rows to keep and the minimum
            # embedding distance between kept rows -- confirm against DeitaFiltering.
            "deita_filtering": {"data_budget": 3000, "diversity_threshold": 0.04},
        },
        # NOTE(review): presumably reuses results from a previous run -- confirm.
        use_cache=True,
    )
    # Publish the final output of the pipeline.
    distiset.push_to_hub("distilabel-internal-testing/deita")

src/distilabel/steps/deita.py

gabrielmbmb added 7 commits March 25, 2024 13:42

Fix macOS fork issue

0efd0b7

Update Evol{Complexity,Quality}

1cf7a64

Add aux steps

625a040

Merge branch 'core-refactor' into deita

d6614b0

Add DeitaFiltering step

90043a4

Add missing float

d941ef5

Add missing NearestNeighbors attribute

6e7ea39

gabrielmbmb force-pushed the deita branch from 70deace to 6e7ea39 Compare March 25, 2024 18:12

gabrielmbmb added 13 commits March 26, 2024 09:12

Fix device llm placement map was not always set

0f968ad

Merge branch 'core-refactor' into deita

7f9a50b

Use mixin only when device="cuda"

b72e1cb

Fix leaf steps directories not created

0297276

Update docstrings

eec0226

Remove setting macOS env var

8bf5efd

Add missing unit tests

a0fddef

Fix EvolQuality steps

4cafb33

Impute deita score

9dafb05

Fix score sorting

230ea25

Fix empty_buffers for accumulate=True

defeb16

Merge branch 'core-refactor' into deita

b679aa3

Add missing pipeline name

f7f5e13

gabrielmbmb requested review from alvarobartt, davidberenstein1957 and plaguss March 27, 2024 14:06

gabrielmbmb self-assigned this Mar 27, 2024

gabrielmbmb added the enhancement New feature or request label Mar 27, 2024

gabrielmbmb added this to the 1.0.0 milestone Mar 27, 2024

gabrielmbmb linked an issue Mar 27, 2024 that may be closed by this pull request

Add DeitaFiltering step #396

Closed

davidberenstein1957 reviewed Mar 27, 2024

View reviewed changes

src/distilabel/steps/deita.py Outdated Show resolved Hide resolved

src/distilabel/steps/deita.py Show resolved Hide resolved

src/distilabel/steps/deita.py Outdated Show resolved Hide resolved

src/distilabel/steps/deita.py Outdated Show resolved Hide resolved

Fix unit tests

a00c616

gabrielmbmb force-pushed the deita branch from 9cb4e58 to a00c616 Compare March 27, 2024 14:21

Remove scikit-learn dep

a84199b

gabrielmbmb marked this pull request as ready for review March 27, 2024 14:50

Add deita_score_computed_with column

3b9ef12

gabrielmbmb merged commit 5703f3d into core-refactor Mar 27, 2024
4 checks passed

gabrielmbmb deleted the deita branch March 27, 2024 15:27

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add `DeitaFiltering` step #481

Add `DeitaFiltering` step #481

gabrielmbmb commented Mar 25, 2024 •

edited

Add DeitaFiltering step #481

Add DeitaFiltering step #481

Conversation

gabrielmbmb commented Mar 25, 2024 • edited

Description

Pipeline

Add `DeitaFiltering` step #481

Add `DeitaFiltering` step #481

gabrielmbmb commented Mar 25, 2024 •

edited