In [24]:
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, load_from_disk, concatenate_datasets

from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

import json
import re
import os
import math
import pandas as pd

In [2]:
NUM_GPUS = 2
BATCH_SIZE = 128

In [3]:
ds = load_dataset('amang1802/wikipedia_controversial_sections')['train']

In [4]:
ds

Dataset({
    features: ['article_title', 'url', 'section_title', 'section_text'],
    num_rows: 184160
})

In [5]:
model_id = "google/gemma-3-27b-it"

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [7]:
df = ds.to_pandas()

In [8]:
df['section_length'] = df.apply(lambda row: len(row['section_text']), axis=1)

In [9]:
len(tokenizer(df.iloc[df['section_length'].argmax()].section_text)['input_ids'])

14316

In [10]:
system_prompt = """
# Task: Analyze Wikipedia Text for Divergent Issues
You will be given an excerpt from a Wikipedia article with a title, section name, and text content. Your task is to determine if the text discusses a divergent issue and assess its scale.


# Definition of a Divergent Issue:
A divergent issue is a topic where:

Different groups of people hold substantially different viewpoints, opinions, or positions
There is legitimate debate, controversy, or disagreement about the topic
The disagreement stems from different values, priorities, interpretations, or interests (not just factual errors)

# Scoring Scale (1-5):

0: Not a divergent issue at all
1: Very niche issue with minimal community interest (affects only a tiny, specialized group)
2: Limited issue affecting a small but identifiable community (local controversy, specialized field debate)
3: Moderate issue with clear stakeholder groups (regional issue, specific industry concern, particular demographic)
4: Reasonably well-known issue with broad community interest (national debates, major industry concerns, widespread social issues)
5: Major societal issue with widespread recognition (global concerns, fundamental rights, major political/social divides)

# Output Format:
Respond with a JSON object containing exactly these fields:

{
  "divergent_issue_rationale": "[Explain why this is or isn't a divergent issue, starting with 'This text discusses a **divergent issue**...' or 'This text does **not** contain discussion of a divergent issue...']",
  "is_divergent_issue": [true or false],
  "divergent_issue_scale_rationale": "[If divergent issue: explain the scale rating considering community size, geographic scope, and societal impact. If not divergent issue: exactly 'NA']",
  "divergent_issue_scale": [0-5, where 0 means not a divergent issue]
}
"""

In [11]:
len(tokenizer(system_prompt)['input_ids'])

397

In [12]:
os.environ['VLLM_USE_FLASHINFER'] = '0'

In [13]:
llm = LLM(model=model_id, max_model_len=16384, tensor_parallel_size=NUM_GPUS, gpu_memory_utilization=0.98)

INFO 06-17 19:19:55 [__init__.py:30] Available plugins for group vllm.general_plugins:
INFO 06-17 19:19:55 [__init__.py:32] name=lora_filesystem_resolver, value=vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
INFO 06-17 19:19:55 [__init__.py:34] all available plugins for group vllm.general_plugins will be loaded.
INFO 06-17 19:19:55 [__init__.py:36] set environment variable VLLM_PLUGINS to control which plugins to load.
INFO 06-17 19:19:55 [__init__.py:44] plugin lora_filesystem_resolver loaded.
INFO 06-17 19:20:02 [config.py:787] This model supports multiple tasks: {'generate', 'embed', 'score', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 06-17 19:20:02 [config.py:1869] Defaulting to use mp for distributed inference
INFO 06-17 19:20:02 [config.py:2112] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 06-17 19:20:04 [core.py:427] Waiting for init message from front-end.
INFO 06-17 19:20:04 [core.py:61] Initializing a V1 LLM engin

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[1;36m(VllmWorker rank=0 pid=757991)[0;0m INFO 06-17 19:20:04 [shm_broadcast.py:266] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_93c49d23'), local_subscribe_addr='ipc:///tmp/999c0449-e778-4380-9c39-edd98a719f02', remote_subscribe_addr=None, remote_addr_ipv6=False)
[1;36m(VllmWorker rank=1 pid=757992)[0;0m INFO 06-17 19:20:04 [shm_broadcast.py:266] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_53aac81f'), local_subscribe_addr='ipc:///tmp/afb9fe5a-00b0-49da-9652-fa675d354340', remote_subscribe_addr=None, remote_addr_ipv6=False)


[W617 19:20:06.177721447 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W617 19:20:06.250432617 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W617 19:20:06.250773476 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3


[1;36m(VllmWorker rank=0 pid=757991)[0;0m [1;36m(VllmWorker rank=1 pid=757992)[0;0m INFO 06-17 19:20:06 [utils.py:1071] Found nccl from library libnccl.so.2
INFO 06-17 19:20:06 [utils.py:1071] Found nccl from library libnccl.so.2
[1;36m(VllmWorker rank=0 pid=757991)[0;0m [1;36m(VllmWorker rank=1 pid=757992)[0;0m INFO 06-17 19:20:06 [pynccl.py:69] vLLM is using nccl==2.26.2
INFO 06-17 19:20:06 [pynccl.py:69] vLLM is using nccl==2.26.2


[W617 19:20:06.461040835 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W617 19:20:06.461202404 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3


[1;36m(VllmWorker rank=0 pid=757991)[0;0m [1;36m(VllmWorker rank=1 pid=757992)[0;0m INFO 06-17 19:20:07 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /root/.cache/vllm/gpu_p2p_access_cache_for_0,1.json
INFO 06-17 19:20:07 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /root/.cache/vllm/gpu_p2p_access_cache_for_0,1.json
[1;36m(VllmWorker rank=0 pid=757991)[0;0m INFO 06-17 19:20:07 [shm_broadcast.py:266] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_5ac88801'), local_subscribe_addr='ipc:///tmp/30a31a38-59fd-4a80-a8d7-6fa8a0f0be5d', remote_subscribe_addr=None, remote_addr_ipv6=False)
[1;36m(VllmWorker rank=0 pid=757991)[0;0m [1;36m(VllmWorker rank=1 pid=757992)[0;0m INFO 06-17 19:20:07 [parallel_state.py:1079] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 06-17 19:20:07 [parallel_state.py:1079] rank 1 in world size 2 is assigned as DP rank

[1;36m(VllmWorker rank=0 pid=757991)[0;0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
[1;36m(VllmWorker rank=1 pid=757992)[0;0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


[1;36m(VllmWorker rank=0 pid=757991)[0;0m INFO 06-17 19:20:11 [gpu_model_runner.py:1503] Starting to load model google/gemma-3-27b-it...
[1;36m(VllmWorker rank=0 pid=757991)[0;0m INFO 06-17 19:20:11 [cuda.py:216] Using Flash Attention backend on V1 engine.
[1;36m(VllmWorker rank=1 pid=757992)[0;0m INFO 06-17 19:20:11 [gpu_model_runner.py:1503] Starting to load model google/gemma-3-27b-it...
[1;36m(VllmWorker rank=0 pid=757991)[0;0m INFO 06-17 19:20:11 [backends.py:37] Using InductorAdaptor
[1;36m(VllmWorker rank=1 pid=757992)[0;0m INFO 06-17 19:20:11 [cuda.py:216] Using Flash Attention backend on V1 engine.
[1;36m(VllmWorker rank=0 pid=757991)[0;0m INFO 06-17 19:20:11 [weight_utils.py:291] Using model weights format ['*.safetensors']
[1;36m(VllmWorker rank=1 pid=757992)[0;0m INFO 06-17 19:20:12 [backends.py:37] Using InductorAdaptor


Loading safetensors checkpoint shards:   0% Completed | 0/12 [00:00<?, ?it/s]


[1;36m(VllmWorker rank=1 pid=757992)[0;0m INFO 06-17 19:20:12 [weight_utils.py:291] Using model weights format ['*.safetensors']
[1;36m(VllmWorker rank=0 pid=757991)[0;0m INFO 06-17 19:20:19 [default_loader.py:279] Loading weights took 7.28 seconds
[1;36m(VllmWorker rank=0 pid=757991)[0;0m INFO 06-17 19:20:19 [gpu_model_runner.py:1521] Model loading took 25.9044 GiB and 7.641194 seconds
[1;36m(VllmWorker rank=1 pid=757992)[0;0m INFO 06-17 19:20:20 [default_loader.py:279] Loading weights took 8.11 seconds
[1;36m(VllmWorker rank=1 pid=757992)[0;0m INFO 06-17 19:20:20 [gpu_model_runner.py:1521] Model loading took 25.9044 GiB and 8.529004 seconds
[1;36m(VllmWorker rank=0 pid=757991)[0;0m [1;36m(VllmWorker rank=1 pid=757992)[0;0m INFO 06-17 19:20:20 [gpu_model_runner.py:1823] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 64 image items of the maximum feature size.
INFO 06-17 19:20:20 [gpu_model_runner.py:1823] Encoder cache will be initiali

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [14]:
def format_section(title, section, text):
    return f"Title: {title}\n\nSection: {section}\n\nText: {text}"

In [15]:
pattern = r'Educational score: (\d+)\s*$'
def extract_score(response):
    try:
        data = json.loads(response)
        score = data.get('divergent_issue_scale')
    except ValueError:
        data = {}
        score = 0

    return data, score

In [16]:
OUTPUT_JSON_SCHEMA = {
  "type": "object",
  "properties": {
    "divergent_issue_rationale": {
      "type": "string"
    },
    "is_divergent_issue": {
      "type": "boolean"
    },
    "divergent_issue_scale_rationale": {
      "type": "string"
    },
    "divergent_issue_scale": {
      "type": "integer",
      "minimum": 0,
      "maximum": 5
    }
  },
  "required": [
    "divergent_issue_rationale",
    "is_divergent_issue",
    "divergent_issue_scale_rationale",
    "divergent_issue_scale"
  ],
  "additionalProperties": False
}

In [17]:
def classify(titles, sections, texts):
    messages = [[{"role": "system", "content": system_prompt},
                {"role": "user", "content": format_section(title, section, text)}] for title, section, text in zip(titles, sections, texts)]
    
    outputs = llm.chat(messages, SamplingParams(temperature=0.25, max_tokens=512, guided_decoding=GuidedDecodingParams(json=OUTPUT_JSON_SCHEMA)))
    responses = [output.outputs[0].text.strip() for output in outputs]
    datas, scores =  zip(*[extract_score(response) for response in responses])
    
    return {"classification_json": list(datas), "issue_scale": list(scores)}

In [19]:
total_count = ds.num_rows
num_steps = 25
step_size = math.ceil(total_count / num_steps)

for step in range(24, num_steps):
    print(f"Running step: {step}")

    start_i = step * step_size
    end_i = (step+1) * step_size
    

    cls_ds = ds.select(range(start_i, min(end_i, ds.num_rows))).map(classify, batched=True, batch_size=step_size, input_columns=['article_title', 'section_title', 'section_text'])
    cls_ds.save_to_disk(f"wiki-issue-{step}.hf")

Running step: 24


Map:   0%|          | 0/7352 [00:00<?, ? examples/s]

Adding requests:   0%|          | 0/7352 [00:00<?, ?it/s]

Processed prompts:   0% 0/7352 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Saving the dataset (0/1 shards):   0%|          | 0/7352 [00:00<?, ? examples/s]

In [21]:
import glob

files = glob.glob("wiki-issue-*")

In [23]:
processed_ds = []
for f in files:
    processed_ds.append(load_from_disk(f))

In [25]:
combined_ds = concatenate_datasets(processed_ds)

In [26]:
combined_ds.num_rows

184160

In [27]:
combined_ds[0]

{'article_title': '1984 (magazine)',
 'url': 'https://en.wikipedia.org/wiki/1984_(magazine)',
 'section_title': 'Controversies',
 'section_text': 'One of the most notable incidents that occurred regarding the magazine was an unauthorized adaptation of Harlan Ellison \'s short story, "A Boy and His Dog", which has been rumored as one of the major factors in the bankruptcy of Warren Publishing. As discussed in the book The Warren Companion, editor Bill Dubay approached writers Gerry Boudreau and Jim Stenstrum about adapting science fiction stories for the magazine. Boudreau asked permission to adapt Ellison\'s story, and Dubay approved this, without first asking Ellison. When Ellison refused to grant permission, Dubay had artist Alex Niño draw the story anyway, then provided the art to Stenstrum to use as the basis for a new story. The story was published in issue #4, under the title "Mondo Megillah". Despite Stenstrum\'s reworking of the script, the basic story was still obvious plagiar

In [28]:
combined_ds.push_to_hub('amang1802/wikipedia_controversial_sections')

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/93 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]



Creating parquet from Arrow format:   0%|          | 0/93 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/datasets/amang1802/wikipedia_controversial_sections/commit/d232eef82c833816ba96d8e6e752948727cac9f5', commit_message='Upload dataset', commit_description='', oid='d232eef82c833816ba96d8e6e752948727cac9f5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/amang1802/wikipedia_controversial_sections', endpoint='https://huggingface.co', repo_type='dataset', repo_id='amang1802/wikipedia_controversial_sections'), pr_revision=None, pr_num=None)