In [1]:
import os

In [2]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from pydantic import BaseModel, TypeAdapter
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

import json
import random
import traceback

In [3]:
# Number of GPUs the model is sharded across; passed to LLM(...) as tensor_parallel_size.
NUM_GPUS = 4
# Passed to LLM(...) as max_num_seqs when the engine is created.
BATCH_SIZE = 512

In [4]:
# System prompt sent as the "system" message of every chat request built in
# create_doubts. It defines the sceptical "Simon" persona, asks for a JSON list
# of {"doubt": "..."} objects, and includes one worked example in the same
# Speaker / Context / Date / Statement layout used for the user message.
# The trailing .strip() removes the leading and trailing newlines introduced by
# the triple-quoted literal. The text is runtime input to the model, so any
# edit to it changes the generated output.
system_prompt = """
# Instructions

Simon loves Socrates and never takes any statement at face value. Unfortunately, that sometimes mean that nothing convinces him of the truth of any statement. 

- Your goal is to impersonate Simon.
- Assume all statements are false and made with a malicious intent to spread misinformation.
- Output as many as possible reasons why he might question its truth.
- These explanations are intelligent and rational.
- They are intended to motivate research to answer the truth.
- Keep these doubts short.
- Make sure any two doubts are different from each other. We don't want similar doubts appearing multiple times.

# Output Instructions

Reply in JSON using the following format.

[
  {
    "doubt": "string"
  },
  {
    "doubt": "string"
  },
  {
    "doubt": "string"
  },
...
]

# Example:

Speaker:
Kathleen Sullivan is the town of Warren’s substance abuse coordinator. She is also program director for The BAY Team, Barrington’s Prevention Coalition.

Context:
testimony before the House Judiciary Committee

Date:
April 29, 2015

Statement:
The proportion of Rhode Islanders entering substance abuse treatment primarily due to marijuana use has reached its highest point in 20 years.

Response:
[
  {
    "doubt": "Marijuana is legal now, and there are lots of studies which show that actually reduces substance abuse."
  },
  {
    "doubt": "While some drugs and opioids continue to cause a substance abuse problem, Marijuana substance has likely already peaked."
  },
  {
    "doubt": "It's a play on words and the truth maybe that it peaked sometime in the last 20 years."
  }
]
""".strip()

In [5]:
# Model identifier handed to vLLM's LLM(model=...) when the engine is created.
model_id = "meta-llama/Llama-3.3-70B-Instruct"

In [6]:
# Create the vLLM engine once; create_doubts reuses this module-level `llm`
# for every batch of prompts.
llm = LLM(
    model=model_id,
    tensor_parallel_size=NUM_GPUS,
    max_num_seqs=BATCH_SIZE,
    max_model_len=8192,
    gpu_memory_utilization=0.98,
)

INFO 01-23 17:13:44 config.py:905] Defaulting to use mp for distributed inference
INFO 01-23 17:13:44 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='meta-llama/Llama-3.3-70B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.3-70B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Llama-3

Loading safetensors checkpoint shards:   0% Completed | 0/30 [00:00<?, ?it/s]


INFO 01-23 17:14:02 model_runner.py:1067] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=196764)[0;0m INFO 01-23 17:14:02 model_runner.py:1067] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=196765)[0;0m INFO 01-23 17:14:02 model_runner.py:1067] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=196763)[0;0m INFO 01-23 17:14:02 model_runner.py:1067] Loading model weights took 32.8892 GB
INFO 01-23 17:14:06 distributed_gpu_executor.py:57] # GPU blocks: 77774, # CPU blocks: 3276
INFO 01-23 17:14:06 distributed_gpu_executor.py:61] Maximum concurrency for 8192 tokens per request: 151.90x
INFO 01-23 17:14:08 model_runner.py:1395] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 01-23 17:14:08 model_runner.py:1399] CUDA graphs can take additional 1~3 GiB memory per GPU. If you 

In [7]:
# One generated doubt. Documented with comments rather than a class docstring
# because pydantic includes a model's docstring in the JSON schema it generates,
# which would change `json_schema` below.
class Doubt(BaseModel):
    # Text of a single doubt about the statement.
    doubt: str

# Adapter for the full response shape: a JSON array of Doubt objects.
ta = TypeAdapter(list[Doubt])

# JSON schema for list[Doubt]; create_doubts passes it to GuidedDecodingParams(json=...).
json_schema = ta.json_schema()

In [8]:
# Load the "chengxuphd/liar2" dataset and keep only its 'train' split.
ds = load_dataset("chengxuphd/liar2")['train']
# Display the row count as the cell output.
ds.num_rows

18369

In [9]:
# Drop rows where any of the four fields used to build the prompt is missing;
# create_doubts concatenates them as strings, so None values cannot be used.
ds = ds.filter(
    lambda row: all(
        row[column] is not None
        for column in ('speaker_description', 'context', 'date', 'statement')
    )
)
# Display the remaining row count as the cell output.
ds.num_rows

18239

In [23]:
def _build_doubt_messages(speaker, context, date, statement):
    """Build the system + user chat messages for a single statement."""
    user_prompt = (
        "Speaker:\n" + speaker
        + "\n\nContext:\n" + context
        + "\n\nDate:\n" + date
        + "\n\nStatement:\n" + statement
    )
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]


def _parse_doubts(response):
    """Parse one raw completion; return the decoded list, or None if unusable."""
    try:
        parsed = json.loads(response)
    except json.JSONDecodeError:
        # e.g. a generation that stopped before the JSON array was closed.
        return None
    # Anything other than a list would not match the other rows in the column.
    return parsed if isinstance(parsed, list) else None


def create_doubts(speakers, contexts, dates, statements):
    """Generate critical doubts for a batch of statements.

    Intended for use with ``Dataset.map(..., batched=True)``: each argument is
    a list holding one column of the batch, and the four lists are zipped
    row by row.

    Args:
        speakers: Speaker descriptions, one string per row.
        contexts: Context in which each statement was made.
        dates: Date of each statement, as a string.
        statements: The statements to be questioned.

    Returns:
        A dict with one key, ``"critical_doubts"``, holding one list per input
        row. Each list is the decoded JSON response. A response that cannot be
        decoded into a list yields an empty list, and the number of such
        failures is printed once per batch so they do not go unnoticed.
    """
    messages = [
        _build_doubt_messages(speaker, context, date, statement)
        for speaker, context, date, statement in zip(speakers, contexts, dates, statements)
    ]
    if not messages:
        # Nothing to generate; avoid calling the engine with an empty batch.
        return {"critical_doubts": []}

    # Constrain generation to the JSON schema derived from list[Doubt].
    guided_decoding_params = GuidedDecodingParams(json=json_schema)
    sampling_params = SamplingParams(
        temperature=0.3,
        top_p=0.9,
        max_tokens=1024,
        guided_decoding=guided_decoding_params,
    )
    outputs = llm.chat(messages, sampling_params)

    doubts = []
    num_failed = 0
    for output in outputs:
        parsed = _parse_doubts(output.outputs[0].text.strip())
        if parsed is None:
            # Best effort: keep the row, but record that it has no usable doubts.
            num_failed += 1
            parsed = []
        doubts.append(parsed)

    if num_failed:
        print(
            f"create_doubts: {num_failed}/{len(outputs)} responses were not "
            "valid JSON lists; stored as empty lists"
        )

    return {
        "critical_doubts": doubts,
    }

In [24]:
# Generate doubts for the first 32 rows only. batch_size=ds.num_rows is at
# least as large as the selection, so create_doubts receives all selected rows
# in a single call.
doubts_ds = ds.select(range(32)).map(
    create_doubts,
    input_columns=['speaker_description', 'context', 'date', 'statement'],
    batched=True,
    batch_size=ds.num_rows,
)

Map:   0%|          | 0/32 [00:00<?, ? examples/s]


Processed prompts:   0% 0/32 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts:   3% 1/32 [00:01<00:38,  1.23s/it, est. speed input: 368.31 toks/s, output: 4.07 toks/s]
Processed prompts:  94% 30/32 [00:07<00:00,  4.08it/s, est. speed input: 1928.89 toks/s, output: 46.95 toks/s]
Processed prompts:  97% 31/32 [00:08<00:00,  3.92it/s, est. speed input: 1871.70 toks/s, output: 72.95 toks/s]
Processed prompts: 100% 32/32 [00:08<00:00,  3.80it/s, est. speed input: 1863.43 toks/s, output: 100.21 toks/s]


In [25]:
# Inspect the first processed row: the original columns plus the new "critical_doubts" field.
doubts_ds[0]

{'id': 13847,
 'label': 5,
 'statement': '90 percent of Americans "support universal background checks" for gun purchases.',
 'date': 'October 2, 2017',
 'subject': 'government regulation;polls and public opinion;guns',
 'speaker': 'chris abele',
 'speaker_description': 'Chris Abele is Milwaukee County Executive, a position he won in an April 2011 special election to finish out the final year of the term of Scott Walker, who was elected governor in November 2010. The election was the first attempt at political office for Abele, a Milwaukee philanthropist and business owner.\r\nThe office is nonpartisan, but Abele has indicated he is a Democrat.',
 'state_info': 'wisconsin',
 'true_counts': 1,
 'mostly_true_counts': 4,
 'half_true_counts': 5,
 'mostly_false_counts': 3,
 'false_counts': 5,
 'pants_on_fire_counts': 2,
 'context': 'a tweet',
 'justification': '"Universal" is the term for background checks to be done on every gun sale. We found support for that policy at 94 percent in the l

In [26]:
# Upload the processed rows to the 'amang1802/liar2-doubts' dataset repo on the Hugging Face Hub.
doubts_ds.push_to_hub('amang1802/liar2-doubts')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/datasets/amang1802/liar2-doubts/commit/e3e0b881b715c00ebad4952fdcbbd537ea87ffef', commit_message='Upload dataset', commit_description='', oid='e3e0b881b715c00ebad4952fdcbbd537ea87ffef', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/amang1802/liar2-doubts', endpoint='https://huggingface.co', repo_type='dataset', repo_id='amang1802/liar2-doubts'), pr_revision=None, pr_num=None)