# Exploring LLM Dataset Creation and Evaluation

## Install the required libraries

In [1]:
!pip install -qqq huggingface-hub argilla "distilabel[huggingface]" accelerate openai datasets

[0m

In [18]:
from datasets import load_dataset
import argilla as rg
from google.colab import userdata

# Load the DIBT/10k_prompts_ranked dataset and list the columns of its "train" split
dataset = load_dataset("DIBT/10k_prompts_ranked")
column_names = dataset["train"].column_names
print(column_names)

['prompt', 'quality', 'metadata', 'avg_rating', 'num_responses', 'agreement_ratio', 'raw_responses', 'kind', 'cluster_description', 'topic']


In [19]:
# Show the first record of the "train" split to see what a single row contains
dataset['train'][0]

{'prompt': 'Provide step-by-step instructions on how to make a safe and effective homemade all-purpose cleaner from common household ingredients. The guide should include measurements, tips for storing the cleaner, and additional variations or scents that can be added. Additionally, the guide should be written in clear and concise language, with helpful visuals or photographs to aid in the process.',
 'quality': [{'user_id': 'd23b12c2-b601-490e-b5b3-2040eb393a00',
   'value': '4',
   'status': 'submitted'},
  {'user_id': 'e2bdd868-f28e-46fc-9254-a6ec1e291889',
   'value': '4',
   'status': 'submitted'}],
 'metadata': '{"source": "ultrachat", "kind": "synthetic", "evolved_from": null}',
 'avg_rating': 5.0,
 'num_responses': 2,
 'agreement_ratio': 1.0,
 'raw_responses': [5, 5],
 'kind': 'synthetic',
 'cluster_description': 'Sustainable Packaging & Skin Care Products',
 'topic': 'Environmental Issues'}

In [20]:
from uuid import uuid4

import argilla as rg

# Connect to the Argilla server. The URL and API key are read from Colab
# secrets so that no credentials are written into the notebook.
client = rg.Argilla(
    api_url=userdata.get('argilla_api_url'),
    api_key=userdata.get('argilla_api_key'),
)

workspace = "argilla"
# The random UUID suffix gives every execution of this cell a distinct dataset name.
dataset_name = f"DIBT_sample_prompts_{uuid4()}"

# Fields hold the content shown to annotators; the single question collects a
# thumbs-down / thumbs-up judgement on the generated text.
argilla_settings = rg.Settings(
    fields=[
        rg.TextField(name="id"),
        rg.TextField(name="instruction"),
        rg.TextField(name="generation"),
    ],
    questions=[
        rg.LabelQuestion(
            name="quality",
            labels=["👎", "👍"],
            title="Quality of the generated text",
        )
    ],
)

# Bound to `argilla_dataset` rather than `dataset` so the Hugging Face dataset
# loaded earlier under the name `dataset` is not overwritten.
argilla_dataset = rg.Dataset(
    name=dataset_name,
    workspace=workspace,
    settings=argilla_settings,
    client=client,
)

# Create the dataset on the server
argilla_dataset.create()

print(f"New dataset '{dataset_name}' created in workspace '{workspace}'")

--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/handlers.py", line 1475, in emit
    self.enqueue(self.prepare(record))
  File "/usr/lib/python3.10/logging/handlers.py", line 1436, in enqueue
    self.queue.put_nowait(record)
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 138, in put_nowait
    return self.put(obj, False)
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 88, in put
    raise ValueError(f"Queue {self!r} is closed")
ValueError: Queue <multiprocessing.queues.Queue object at 0x7e77bc2858a0> is closed
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/

New dataset 'DIBT_sample_prompts_ff3c49a6-e676-4536-92cc-0e22bfc568c9' created in workspace 'argilla'


In [6]:
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
from pathlib import Path

# Hub repository to fetch, and the local folder that will receive its files
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
local_model_path = "/content/tinyllama-1.1b-chat"

# Fetch the repository snapshot into the local folder
print(f"Downloading {model_name} to {local_model_path}...")
snapshot_download(repo_id=model_name, local_dir=local_model_path)

# Sanity check: the download is considered good if both the tokenizer and the
# model can be instantiated from the local folder
print("Loading the model to verify the download...")
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = AutoModelForCausalLM.from_pretrained(local_model_path)

print(f"Model {model_name} has been successfully downloaded and loaded.")

# Add up the on-disk size of every regular file below the download folder
total_size = 0
for entry in Path(local_model_path).rglob('*'):
    if entry.is_file():
        total_size += entry.stat().st_size
print(f"Total size of the downloaded model: {total_size / 1e9:.2f} GB")

Downloading TinyLlama/TinyLlama-1.1B-Chat-v1.0 to /content/tinyllama-1.1b-chat...


Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

Loading the model to verify the download...
Model TinyLlama/TinyLlama-1.1B-Chat-v1.0 has been successfully downloaded and loaded.
Total size of the downloaded model: 2.20 GB


In [21]:
def _is_highly_rated(row):
    """Return True when a prompt has an average rating >= 4 and at least 2 responses."""
    return float(row["avg_rating"]) >= 4 and int(row["num_responses"]) >= 2


# Keep only the best-rated prompts of the "train" split
prompts_train = load_dataset("DIBT/10k_prompts_ranked", split="train")
filtered_dataset = prompts_train.filter(_is_highly_rated)

In [22]:
# Display the feature schema (column names and dtypes) of the filtered dataset
filtered_dataset.features

{'prompt': Value(dtype='string', id='field'),
 'quality': [{'user_id': Value(dtype='string', id='question'),
   'value': Value(dtype='string', id='suggestion'),
   'status': Value(dtype='string', id='question')}],
 'metadata': Value(dtype='string', id='metadata'),
 'avg_rating': Value(dtype='float64', id=None),
 'num_responses': Value(dtype='int64', id=None),
 'agreement_ratio': Value(dtype='float64', id=None),
 'raw_responses': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'kind': Value(dtype='string', id=None),
 'cluster_description': Value(dtype='string', id=None),
 'topic': Value(dtype='string', id=None)}

## Option A: Use a local LLM to create your pipeline

In [15]:
from distilabel.llms import TransformersLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import (
    LoadDataFromDicts,
    TextGenerationToArgilla,
)
from distilabel.steps.tasks import TextGeneration
from datasets import load_dataset

# Work on the first 10 rows of the filtered dataset because of compute requirements.
# This subsetting can be skipped when the environment is not compute-constrained.
filtered_dataset_10 = filtered_dataset.select(range(10))
# Display the subset's summary as the cell output
filtered_dataset_10

Dataset({
    features: ['prompt', 'quality', 'metadata', 'avg_rating', 'num_responses', 'agreement_ratio', 'raw_responses', 'kind', 'cluster_description', 'topic'],
    num_rows: 10
})

In [12]:
# Create the pipeline: load prompts -> generate with the local TinyLlama -> push to Argilla
with Pipeline(
    name="prefs-with-tinyllama",
    description="Pipeline for building preference datasets using TinyLlama",
) as pipeline:
    # The step variable is named `load_data` (not `load_dataset`) so it does not
    # shadow `datasets.load_dataset`, which other cells call on re-run.
    # The step's own name stays "load_dataset" because the runtime parameters
    # passed to `pipeline.run` below are keyed by it.
    load_data = LoadDataFromDicts(
        name="load_dataset",
        # LoadDataFromDicts is documented to take a list of dicts, so the rows
        # of the 10-row subset are materialised explicitly.
        data=filtered_dataset_10.to_list(),
        # Expose the "prompt" column under the name "instruction" for the next step
        output_mappings={"prompt": "instruction"},
    )
    text_generation = TextGeneration(
        name="text_generation",
        llm=TransformersLLM(
            model=local_model_path,
            device_map="auto",  # This will use available GPU(s) efficiently
            torch_dtype="auto",  # This will use the appropriate dtype for the model
            trust_remote_code=True,  # This may be necessary for some models
            model_kwargs={
                "low_cpu_mem_usage": True,  # This can help with memory issues
            },
        ),
    )

    to_argilla = TextGenerationToArgilla(
        name="text_generation_to_argilla",
        dataset_name=dataset_name,
        dataset_workspace=workspace,
    )
    load_data >> text_generation >> to_argilla

# Run the pipeline; the keys of `parameters` are the step names declared above
distiset = pipeline.run(
    parameters={
        "load_dataset": {
            "batch_size": 5,
        },
        "text_generation": {
            "llm": {
                "generation_kwargs": {
                    "max_new_tokens": 512,
                    "temperature": 0.7,
                    "do_sample": True,
                    "top_p": 0.95,
                    "top_k": 50,
                }
            }
        },
        "text_generation_to_argilla": {
            # Credentials are read from Colab secrets rather than hard-coded
            "api_url": userdata.get('argilla_api_url'),
            "api_key": userdata.get('argilla_api_key'),
            "dataset_name": dataset_name,
            "dataset_workspace": workspace,
        },
    }
)

## Option B: Use an OpenAI LLM to create your pipeline

In [23]:
from distilabel.llms import OpenAILLM
import os

# Copy the OpenAI key from Colab secrets into the process environment
# (presumably where OpenAILLM looks for it — confirm against the distilabel docs).
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

In [28]:
# Display the feature schema (column names and dtypes) of the 10-row subset
filtered_dataset_10.features

{'prompt': Value(dtype='string', id='field'),
 'quality': [{'user_id': Value(dtype='string', id='question'),
   'value': Value(dtype='string', id='suggestion'),
   'status': Value(dtype='string', id='question')}],
 'metadata': Value(dtype='string', id='metadata'),
 'avg_rating': Value(dtype='float64', id=None),
 'num_responses': Value(dtype='int64', id=None),
 'agreement_ratio': Value(dtype='float64', id=None),
 'raw_responses': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'kind': Value(dtype='string', id=None),
 'cluster_description': Value(dtype='string', id=None),
 'topic': Value(dtype='string', id=None)}

In [None]:
# Create the pipeline: load prompts -> generate with OpenAI -> push to Argilla
with Pipeline(
    name="prefs-with-openai",
    description="Pipeline for building preference datasets using OpenAI",
) as pipeline:
    # The step variable is named `load_data` (not `load_dataset`) so it does not
    # shadow `datasets.load_dataset`, which other cells call on re-run.
    # The step's own name stays "load_dataset" because the runtime parameters
    # passed to `pipeline.run` below are keyed by it.
    load_data = LoadDataFromDicts(
        name="load_dataset",
        # LoadDataFromDicts is documented to take a list of dicts, so the rows
        # of the 10-row subset are materialised explicitly.
        data=filtered_dataset_10.to_list(),
        # Expose the "prompt" column under the name "instruction" for the next step
        output_mappings={"prompt": "instruction"},
    )
    text_generation = TextGeneration(
        name="text_generation",
        llm=OpenAILLM(model="gpt-4")
    )

    to_argilla = TextGenerationToArgilla(
        name="text_generation_to_argilla",
        dataset_name=dataset_name,
        dataset_workspace=workspace,
    )
    load_data >> text_generation >> to_argilla

# Run the pipeline; the keys of `parameters` are the step names declared above
distiset = pipeline.run(
    parameters={
        "load_dataset": {
            "batch_size": 16,
        },
        "text_generation": {
            "llm": {
                "generation_kwargs": {
                    "temperature": 0.7,
                }
            }
        },
        "text_generation_to_argilla": {
            # Credentials are read from Colab secrets rather than hard-coded
            "api_url": userdata.get('argilla_api_url'),
            "api_key": userdata.get('argilla_api_key'),
            "dataset_name": dataset_name,
            "dataset_workspace": workspace,
        },
    }
)

## Install the EleutherAI LM Evaluation Harness

In [None]:
%%bash
# Abort on the first failing command so a broken clone or install is not silently ignored.
set -e
# Clone only when the checkout is missing, so the cell can be re-run without
# `git clone` failing on an already existing directory.
if [ ! -d lm-evaluation-harness ]; then
    git clone https://github.com/EleutherAI/lm-evaluation-harness
fi
cd lm-evaluation-harness
# Editable install of the harness from the local checkout
pip install -e .

## Evaluate an LLM using the EleutherAI LM Evaluation Harness

In [None]:
%%bash
# Run the hellaswag task against the Hugging Face model EleutherAI/pythia-160m
# (revision step100000) on the CUDA device and write results to hellaswag_test.
# NOTE(review): this evaluates pythia-160m, not the TinyLlama model used earlier
# in this notebook — change `pretrained=` if that model should be evaluated.
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \
    --tasks hellaswag \
    --device cuda \
    --batch_size auto:4 \
    --output_path hellaswag_test