In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [7]:
dataset = load_dataset("andresnowak/Instruction-finetuning-mnlp")

In [8]:
dataset["train"]["messages"][0][0]["content"]

'Provide a detailed analysis of Candace Parker\'s defensive techniques in her recent games, excluding the words "aggressive" and "blocking", in the format of a sports commentary script.'

In [19]:
def tokenize_function(example):
    val = example["messages"][0][0]["content"]

In [20]:
dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/264919 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'prompt', 'messages', 'constraints'],
        num_rows: 264919
    })
})

In [26]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B-Base")

In [29]:
def format_instruction(example):
    """Extract the first user instruction and assistant response"""
    messages = example
    instruction = ""
    response = ""
    
    # Find first user message (instruction)
    for msg in messages:
        if msg['role'] == 'user':
            instruction = msg['content']
            break
    
    # Find corresponding assistant response
    for msg in messages:
        if msg['role'] == 'assistant':
            response = msg['content']
            break
    
    return {
        "input": instruction,
        "output": response,
        "formatted_text": f"### Instruction:\n{instruction}\n\n### Response:\n{response}"
    }

def tokenize_function(examples):
    # Format the examples
    formatted = [format_instruction(ex) for ex in examples["messages"]]
    
    # Tokenize inputs (instruction only)
    model_inputs = tokenizer(
        [f["input"] for f in formatted],
        truncation=True,
        max_length=1000,
        padding="max_length"
    )
    
    # Tokenize outputs (response only) for labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            [f["output"] for f in formatted],
            truncation=True,
            max_length=1000,
            padding="max_length"
        )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/264919 [00:00<?, ? examples/s]



DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 264919
    })
})

: 

In [3]:
dataset = load_dataset("allenai/tulu-3-sft-mixture")

README.md:   0%|          | 0.00/7.71k [00:00<?, ?B/s]

train-00000-of-00006.parquet:   0%|          | 0.00/361M [00:00<?, ?B/s]

train-00001-of-00006.parquet:   0%|          | 0.00/477M [00:00<?, ?B/s]

train-00002-of-00006.parquet:   0%|          | 0.00/147M [00:00<?, ?B/s]

train-00003-of-00006.parquet:   0%|          | 0.00/162M [00:00<?, ?B/s]

train-00004-of-00006.parquet:   0%|          | 0.00/150M [00:00<?, ?B/s]

train-00005-of-00006.parquet:   0%|          | 0.00/116M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/939343 [00:00<?, ? examples/s]

In [20]:
dataset.unique("source")

{'train': ['ai2-adapt-dev/oasst1_converted',
  'ai2-adapt-dev/flan_v2_converted',
  'ai2-adapt-dev/tulu_hard_coded_repeated_10',
  'ai2-adapt-dev/no_robots_converted',
  'ai2-adapt-dev/tulu_v3.9_wildchat_100k',
  'ai2-adapt-dev/personahub_math_v5_regen_149960',
  'allenai/tulu-3-sft-personas-math-grade',
  'ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k',
  'ai2-adapt-dev/numinamath_tir_math_decontaminated',
  'ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k',
  'ai2-adapt-dev/personahub_code_v2_34999',
  'ai2-adapt-dev/evol_codealpaca_heval_decontaminated',
  'ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980',
  'ai2-adapt-dev/coconot_converted',
  'ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k',
  'ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k',
  'ai2-adapt-dev/tulu_v3.9_sciriff_10k',
  'ai2-adapt-dev/tulu_v3.9_table_gpt_5k',
  'ai2-adapt-dev/tulu_v3.9_aya_100k']}

In [22]:
import collections
# Vectorized approach
messages = dataset["train"]["messages"]
sources = dataset["train"]["source"]

# Count messages with length 2
count = sum(1 for msg in messages if len(msg) == 2)

# Count other lengths
lengths = [len(msg) for msg in messages if len(msg) != 2]
count_no = collections.Counter(lengths)

# Get bad datasets
bad_datasets = {source for msg, source in zip(messages, sources) if len(msg) != 2}

print(count, dict(count_no))


896090 {4: 17187, 6: 7583, 7: 649, 5: 35, 9: 68, 19: 2, 13: 4, 15: 3, 8: 4485, 11: 18, 21: 1, 40: 132, 24: 450, 12: 2153, 10: 2873, 16: 1130, 14: 1564, 30: 276, 26: 358, 44: 84, 100: 8, 60: 39, 28: 322, 46: 84, 32: 249, 18: 945, 72: 14, 54: 70, 36: 172, 34: 192, 38: 139, 64: 24, 22: 552, 42: 111, 20: 673, 84: 13, 62: 28, 70: 21, 90: 9, 48: 67, 96: 14, 76: 16, 68: 25, 66: 17, 50: 56, 58: 33, 78: 23, 134: 1, 52: 57, 56: 36, 82: 15, 174: 1, 138: 3, 124: 2, 88: 8, 148: 2, 128: 7, 92: 7, 120: 4, 198: 1, 188: 1, 98: 9, 74: 15, 80: 16, 110: 4, 118: 2, 102: 5, 94: 11, 208: 1, 182: 1, 136: 3, 104: 7, 116: 2, 202: 1, 194: 1, 122: 2, 106: 9, 112: 1, 166: 1, 108: 2, 140: 3, 168: 1, 142: 1, 178: 2, 146: 2, 144: 1, 130: 3, 226: 2, 86: 8, 196: 2, 150: 2, 234: 1, 256: 1, 186: 1, 126: 3, 222: 1, 214: 1, 158: 1, 264: 1, 260: 1, 184: 1, 294: 1, 160: 1, 114: 1, 200: 1, 154: 1}


In [23]:
print(sum(count_no.values()))
print(bad_datasets)

43253
{'ai2-adapt-dev/tulu_v3.9_wildchat_100k', 'ai2-adapt-dev/no_robots_converted', 'ai2-adapt-dev/oasst1_converted'}


In [24]:
import sys

size_bytes = sys.getsizeof(dataset)
print(f"Size in MB: {size_bytes / (1024 ** 2):.2f}")

Size in MB: 0.00


In [2]:
dataset = load_dataset("derek-thomas/ScienceQA", split="train")

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

(…)-00000-of-00001-1028f23e353fbe3e.parquet:   0%|          | 0.00/377M [00:00<?, ?B/s]

(…)-00000-of-00001-6c7328ff6c84284c.parquet:   0%|          | 0.00/126M [00:00<?, ?B/s]

(…)-00000-of-00001-f0e719df791966ff.parquet:   0%|          | 0.00/122M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12726 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4241 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4241 [00:00<?, ? examples/s]

In [10]:
dataset

Dataset({
    features: ['image', 'question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution'],
    num_rows: 12726
})

In [8]:
dataset_no_images = dataset.filter(lambda x: x["image"] is None)

In [13]:
dataset_no_images.unique("subject")

Flattening the indices:   0%|          | 0/6508 [00:00<?, ? examples/s]

['language science', 'natural science', 'social science']

In [16]:
dataset = load_dataset("openlifescienceai/medmcqa", split="train")

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/85.9M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/936k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/182822 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6150 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4183 [00:00<?, ? examples/s]

In [17]:
dataset

Dataset({
    features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
    num_rows: 182822
})

In [18]:
dataset["cop"][0]

2