In [1]:
import json
import os
import re

from datasets import load_dataset, Dataset, load_from_disk
from jinja2 import Template
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

In [2]:
# Number of GPUs the model is sharded across; passed as tensor_parallel_size
# when the vLLM engine is created.
NUM_GPUS = 4
# NOTE(review): not referenced by any cell visible in this notebook (the
# classification loop passes batch_size=step_size) — confirm whether it is
# still needed.
BATCH_SIZE = 256

In [3]:
# Open the 'sample-10BT' config of FineWeb-Edu in streaming mode; the result is
# an IterableDataset that is consumed lazily by the classification loop.
ds = load_dataset(
    'HuggingFaceFW/fineweb-edu',
    name='sample-10BT',
    split='train',
    streaming=True,
)

Resolving data files:   0%|          | 0/2080 [00:00<?, ?it/s]

In [4]:
# Show the dataset repr (feature names and shard count) as a quick sanity check.
ds

IterableDataset({
    features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
    num_shards: 14
})

In [5]:
# System prompt sent with every extract. It defines the additive 5-point
# soft-skills rubric and the exact closing line ("Educational score: <n>")
# that extract_score() parses, so keep that format in sync with the regex.
system_prompt = """
Below is an extract from a web page. You are an AI content evaluator focused on assessing educational material's value for soft skills development. Soft skills include conversational ability, empathy, leadership skills, public speaking, confidence building, critical thinking, problem solving, professional writing, teamwork, digital literacy, professional attitude, work ethic, career management and intercultural fluency. 

You will analyze content using the additive 5-point scoring system described below. Points are accumulated based on the satisfaction of each criterion:
- Add 1 point if the extract shows superficial coverage of basic communication and teamwork concepts without meaningful depth or practical application. Professional development opportunities are limited to theoretical knowledge, and problem-solving scenarios lack complexity or real-world context. Cultural awareness and digital literacy elements are either absent or extremely basic.
- Add another point if the extract specifically includes discussion of soft skills and includes straightforward communication scenarios and simple team dynamics, but lacks nuanced interaction or complex problem-solving opportunities. Professional development focuses on fundamental skills with limited practical application, while cultural awareness and digital literacy are present but superficial.
- Award a third point if the extract specifically includes discussion of soft skills and features realistic scenarios that integrate emotional intelligence, leadership challenges, and critical thinking opportunities. Professional development includes practical applications with meaningful context, while incorporating cultural awareness and modern digital literacy skills throughout the material. 
- Grant a fourth point if the extract specifically includes discussion of soft skills and presents complex scenarios requiring sophisticated communication, strategic thinking, and advanced problem-solving across multiple contexts. Professional development opportunities are comprehensive and practical, with strong emphasis on intercultural fluency and technological adaptation.
- Bestow a fifth point if the extract specifically includes discussion of soft skills and seamlessly integrates advanced communication, leadership, and problem-solving scenarios that mirror real-world complexity. Professional development opportunities span multiple contexts with sophisticated cultural awareness, while digital literacy and practical application are woven throughout every element.

After examining the extract: 
- Briefly justify your total score, up to 100 words.
- Conclude with the score using the format: "Educational score: <total points>"
"""

In [6]:
# Identifier of the judge model; passed as `model` when the vLLM engine is built.
model_id = "meta-llama/Llama-3.3-70B-Instruct"

In [7]:
# Build the vLLM engine for the judge model, using NUM_GPUS-way tensor
# parallelism and a 16384-token maximum model length.
llm = LLM(
    model=model_id,
    max_model_len=16384,
    tensor_parallel_size=NUM_GPUS,
    gpu_memory_utilization=0.98,
)

INFO 01-15 06:41:26 config.py:510] This model supports multiple tasks: {'score', 'reward', 'generate', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 01-15 06:41:26 config.py:1310] Defaulting to use mp for distributed inference
INFO 01-15 06:41:26 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='meta-llama/Llama-3.3-70B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.3-70B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect

Loading safetensors checkpoint shards:   0% Completed | 0/30 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=16614)[0;0m INFO 01-15 06:41:48 model_runner.py:1099] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=16615)[0;0m INFO 01-15 06:41:48 model_runner.py:1099] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=16613)[0;0m INFO 01-15 06:41:48 model_runner.py:1099] Loading model weights took 32.8892 GB
INFO 01-15 06:41:48 model_runner.py:1099] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=16615)[0;0m INFO 01-15 06:41:53 worker.py:241] Memory profiling takes 4.45 seconds
[1;36m(VllmWorkerProcess pid=16615)[0;0m INFO 01-15 06:41:53 worker.py:241] the current vLLM instance can use total_gpu_memory (139.72GiB) x gpu_memory_utilization (0.98) = 136.92GiB
[1;36m(VllmWorkerProcess pid=16615)[0;0m INFO 01-15 06:41:53 worker.py:241] model weights take 32.89GiB; non_torch_memory takes 3.70GiB; PyTorch activation peak memory takes 1.50GiB; the rest of the memory reserved for KV Cache is 98.83GiB.
[1;36m(V

Capturing CUDA graph shapes:   0% 0/35 [00:00<?, ?it/s]

[1;36m(VllmWorkerProcess pid=16614)[0;0m INFO 01-15 06:41:58 model_runner.py:1415] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
[1;36m(VllmWorkerProcess pid=16615)[0;0m INFO 01-15 06:41:58 model_runner.py:1415] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
[1;36m(VllmWorkerProcess 

Capturing CUDA graph shapes:  97% 34/35 [00:22<00:00,  1.52it/s]

[1;36m(VllmWorkerProcess pid=16615)[0;0m INFO 01-15 06:42:20 custom_all_reduce.py:224] Registering 5635 cuda graph addresses
[1;36m(VllmWorkerProcess pid=16614)[0;0m INFO 01-15 06:42:21 custom_all_reduce.py:224] Registering 5635 cuda graph addresses


Capturing CUDA graph shapes: 100% 35/35 [00:23<00:00,  1.52it/s]

INFO 01-15 06:42:21 custom_all_reduce.py:224] Registering 5635 cuda graph addresses





[1;36m(VllmWorkerProcess pid=16613)[0;0m INFO 01-15 06:42:21 custom_all_reduce.py:224] Registering 5635 cuda graph addresses
[1;36m(VllmWorkerProcess pid=16615)[0;0m INFO 01-15 06:42:21 model_runner.py:1535] Graph capturing finished in 24 secs, took 0.45 GiB
[1;36m(VllmWorkerProcess pid=16614)[0;0m INFO 01-15 06:42:21 model_runner.py:1535] Graph capturing finished in 24 secs, took 0.45 GiB
INFO 01-15 06:42:21 model_runner.py:1535] Graph capturing finished in 24 secs, took 0.45 GiB
[1;36m(VllmWorkerProcess pid=16613)[0;0m INFO 01-15 06:42:21 model_runner.py:1535] Graph capturing finished in 23 secs, took 0.45 GiB
INFO 01-15 06:42:21 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 32.88 seconds


In [8]:
# Matches the closing line requested by the system prompt. Whitespace and
# markdown emphasis ("*") are allowed between the label and the digits, and
# nothing is required after the digits, so replies such as
# "Educational score: 4." or "**Educational score:** 3" still parse.
pattern = r'Educational score:[\s*]*(\d+)'
def extract_score(text):
    """Return the integer score reported in a model response.

    Args:
        text: The model's reply, expected to contain a line of the form
            "Educational score: <n>".

    Returns:
        The integer from the last "Educational score:" occurrence in `text`,
        or 0 when no such occurrence exists.
    """
    # Use the last occurrence: the prompt asks the model to *conclude* with
    # the score, so an earlier mention would only be part of the justification.
    matches = re.findall(pattern, text)
    if not matches:
        return 0
    return int(matches[-1])

In [9]:
def classify(texts):
    """Score a batch of extracts with the judge model.

    Each text is cut to its first 50000 characters, wrapped in a
    system + user conversation, and sent to `llm.chat` in one call.

    Args:
        texts: Batch of extract strings.

    Returns:
        A dict with two parallel lists: "justification" (the stripped model
        replies) and "classification_score" (the result of extract_score()
        on each reply).
    """
    sampling = SamplingParams(temperature=0.25, top_p=0.9, max_tokens=256)

    conversations = []
    for text in texts:
        user_turn = "Extract:\n" + text[:50000] + "\n\nJustification:"
        conversations.append([
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_turn},
        ])

    completions = llm.chat(conversations, sampling)
    # Only the first candidate of each request is used.
    justifications = [completion.outputs[0].text.strip() for completion in completions]
    return {
        "justification": justifications,
        "classification_score": [extract_score(reply) for reply in justifications],
    }

In [10]:
# Start IPython session logging to classify_progress.log with raw input (-r)
# and timestamps (-t), so progress of the long-running loop below is recorded.
%logstart -rt classify_progress.log

Activating auto-logging. Current session state plus future input saved.
Filename       : classify_progress.log
Mode           : backup
Output logging : False
Raw input log  : True
Timestamping   : True
State          : active


In [None]:
# Classify the first `total_count` documents of the stream in `num_steps`
# chunks, saving every chunk to its own directory so that finished work
# survives an interruption of this long-running cell.
total_count = 1000 * 1000
num_steps = 100
step_size = total_count // num_steps
for step in range(num_steps):
    out_path = f"soft-skills-cls-{step}.hf"

    # Resume support: a chunk counts as done once its saved-dataset metadata
    # file is present. NOTE(review): assumes save_to_disk() writes
    # dataset_info.json into the target directory — if it does not, the only
    # consequence is that the chunk is recomputed.
    if os.path.exists(os.path.join(out_path, "dataset_info.json")):
        print(f"Skipping step: {step} (already saved)")
        continue

    print(f"Running step: {step}")

    # Derive the window from the untouched `ds` instead of rebinding `ds`
    # each iteration; this keeps the cell idempotent, so re-running it always
    # maps step k to the same documents.
    chunk_stream = ds.skip(step * step_size).take(step_size)
    # batch_size=step_size hands the whole chunk to classify() in one call.
    cls_ds_stream = chunk_stream.map(classify, batched=True, batch_size=step_size, input_columns=["text"])
    cls_ds_list = list(cls_ds_stream)

    cls_ds = Dataset.from_list(cls_ds_list)
    cls_ds.save_to_disk(out_path)

Running step: 0
INFO 01-15 06:42:24 chat_utils.py:333] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.


Processed prompts: 100% 10000/10000 [20:27<00:00,  8.15it/s, est. speed input: 11653.59 toks/s, output: 490.80 toks/s]


Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Running step: 1


Processed prompts: 100% 10000/10000 [20:32<00:00,  8.11it/s, est. speed input: 11658.43 toks/s, output: 497.48 toks/s]


Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Running step: 2


Processed prompts:   0% 22/10000 [00:37<52:53,  3.14it/s, est. speed input: 602.70 toks/s, output: 24.68 toks/s]  

In [None]:
# Stop the session logging started by %logstart above.
%logstop