In [1]:
import os
# Expose only physical GPU 4 to this process. This is set in the first cell,
# ahead of the vllm import in the next one.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [2]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from pydantic import BaseModel, TypeAdapter
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

import json
import random
import traceback

In [3]:
# Number of GPUs the model is sharded across; passed to LLM(...) as
# tensor_parallel_size.
NUM_GPUS = 1
# Upper bound on concurrently scheduled sequences; passed to LLM(...) as
# max_num_seqs.
BATCH_SIZE = 128

In [4]:
# System message sent with every request built in create_doubts. It asks the
# model to impersonate "Simon", treat each statement as false, and reply with a
# JSON array of {"doubt": "..."} objects; one worked example is included.
# The text is part of the program's behaviour (it is the prompt), so it is kept
# verbatim. The trailing .strip() removes the leading/trailing newlines that
# the triple-quoted layout introduces.
system_prompt = """
# Instructions

Simon loves Socrates and never takes any statement at face value. Unfortunately, that sometimes mean that nothing convinces him of the truth of any statement. 

- Your goal is to impersonate Simon.
- Assume all statements are false and made with a malicious intent to spread misinformation.
- Output as many as possible reasons why he might question its truth.
- These explanations are intelligent and rational.
- They are intended to motivate research to answer the truth.
- Keep these doubts short.
- Make sure any two doubts are different from each other. We don't want similar doubts appearing multiple times.

# Output Instructions

Reply in JSON using the following format.

[
  {
    "doubt": "string"
  },
  {
    "doubt": "string"
  },
  {
    "doubt": "string"
  },
...
]

# Example:

Speaker:
Kathleen Sullivan is the town of Warren’s substance abuse coordinator. She is also program director for The BAY Team, Barrington’s Prevention Coalition.

Context:
testimony before the House Judiciary Committee

Date:
April 29, 2015

Statement:
The proportion of Rhode Islanders entering substance abuse treatment primarily due to marijuana use has reached its highest point in 20 years.

Response:
[
  {
    "doubt": "Marijuana is legal now, and there are lots of studies which show that actually reduces substance abuse."
  },
  {
    "doubt": "While some drugs and opioids continue to cause a substance abuse problem, Marijuana substance has likely already peaked."
  },
  {
    "doubt": "It's a play on words and the truth maybe that it peaked sometime in the last 20 years."
  }
]
""".strip()

In [5]:
# Hugging Face Hub id of the checkpoint handed to vLLM's LLM(...) constructor.
model_id = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8"

In [6]:
# Build the vLLM engine once for the whole notebook; create_doubts uses this
# module-level `llm` for every batch.
llm = LLM(
    model=model_id,
    max_model_len=8192,
    tensor_parallel_size=NUM_GPUS,
    gpu_memory_utilization=0.98,
    max_num_seqs=BATCH_SIZE,
)

INFO 01-16 08:25:37 config.py:510] This model supports multiple tasks: {'classify', 'embed', 'reward', 'generate', 'score'}. Defaulting to 'generate'.
INFO 01-16 08:25:37 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8', speculative_config=None, tokenizer='neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_t

Loading safetensors checkpoint shards:   0% Completed | 0/15 [00:00<?, ?it/s]


INFO 01-16 08:25:50 model_runner.py:1099] Loading model weights took 67.6981 GB
INFO 01-16 08:25:52 worker.py:241] Memory profiling takes 1.49 seconds
INFO 01-16 08:25:52 worker.py:241] the current vLLM instance can use total_gpu_memory (139.72GiB) x gpu_memory_utilization (0.98) = 136.92GiB
INFO 01-16 08:25:52 worker.py:241] model weights take 67.70GiB; non_torch_memory takes 0.25GiB; PyTorch activation peak memory takes 1.69GiB; the rest of the memory reserved for KV Cache is 67.29GiB.
INFO 01-16 08:25:52 gpu_executor.py:76] # GPU blocks: 13780, # CPU blocks: 819
INFO 01-16 08:25:52 gpu_executor.py:80] Maximum concurrency for 8192 tokens per request: 26.91x
INFO 01-16 08:25:54 model_runner.py:1415] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_util

Capturing CUDA graph shapes: 100% 19/19 [00:08<00:00,  2.20it/s]

INFO 01-16 08:26:03 model_runner.py:1535] Graph capturing finished in 9 secs, took 0.34 GiB
INFO 01-16 08:26:03 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 12.96 seconds





In [7]:
# Schema for one generated doubt: an object with a single string field.
# NOTE(review): documented with comments rather than a class docstring because
# pydantic emits a model's docstring as `description` in its JSON schema, which
# would change `json_schema` below.
class Doubt(BaseModel):
    doubt: str

# Adapter for the full response shape: a JSON array of Doubt objects.
ta = TypeAdapter(list[Doubt])

# JSON schema of that array; create_doubts passes it to GuidedDecodingParams so
# generation is constrained to this structure.
json_schema = ta.json_schema()

In [8]:
# Load only the training split of LIAR2 and show how many rows it has.
ds = load_dataset("chengxuphd/liar2", split="train")
ds.num_rows

18369

In [9]:
# Keep only rows where every field used to build the prompt is present;
# create_doubts concatenates these values as strings.
ds = ds.filter(
    lambda row: all(
        row[column] is not None
        for column in ('speaker_description', 'context', 'date', 'statement')
    )
)
ds.num_rows

18239

In [10]:
def create_doubts(speakers, contexts, dates, statements):
    """Generate a list of doubts for each statement in a batch.

    Written for ``Dataset.map(..., batched=True, input_columns=[...])``: the
    four arguments are parallel column slices, and the returned column has one
    entry per input row.

    Args:
        speakers: Speaker descriptions, one string per row.
        contexts: Context in which each statement was made, one string per row.
        dates: Statement dates; concatenated into the prompt, so they must be
            strings.
        statements: The statements to generate doubts about.

    Returns:
        ``{"critical_doubts": [...]}`` where element ``i`` is the parsed JSON
        response for row ``i`` (a list of ``{"doubt": str}`` objects), or ``[]``
        when that response was not a valid JSON list.
    """
    messages = [
        [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": "Speaker:\n" + speaker + "\n\nContext:\n" + context
                + "\n\nDate:\n" + date + "\n\nStatement:\n" + statement,
            },
        ]
        for speaker, context, date, statement in zip(speakers, contexts, dates, statements)
    ]

    # Nothing to generate for an empty batch; skip the engine call entirely.
    if not messages:
        return {"critical_doubts": []}

    guided_decoding_params = GuidedDecodingParams(json=json_schema)
    sampling_params = SamplingParams(
        temperature=0.3,
        top_p=0.9,
        max_tokens=1024,
        guided_decoding=guided_decoding_params,
    )
    # Assumes llm.chat returns one output per conversation, in input order.
    outputs = llm.chat(messages, sampling_params)

    # This must be a list. It was previously initialised to "" and then
    # .append() was called on it, which raised AttributeError on the first
    # response, after all generation work had already been done.
    doubts = []
    num_failed = 0
    for output in outputs:
        response = output.outputs[0].text.strip()
        try:
            parsed = json.loads(response)
        except json.JSONDecodeError:
            # e.g. a response cut off at max_tokens before the JSON was closed.
            parsed = None

        # Best effort: keep the row, but store an empty list so the column has
        # a consistent list type.
        if not isinstance(parsed, list):
            num_failed += 1
            parsed = []

        doubts.append(parsed)

    # Report failures instead of swallowing them silently.
    if num_failed:
        print(
            f"create_doubts: {num_failed}/{len(doubts)} responses were not "
            "valid JSON lists; stored [] for them"
        )

    return {
        "critical_doubts": doubts,
    }

In [None]:
# batch_size=ds.num_rows hands every row to create_doubts in one call, so all
# prompts are submitted to llm.chat together.
# NOTE(review): with a single batch, an exception late in the run presumably
# discards every generation; confirm whether smaller batches are preferable.
doubts_ds = ds.map(
    create_doubts,
    input_columns=['speaker_description', 'context', 'date', 'statement'],
    batched=True,
    batch_size=ds.num_rows,
)



Map:   0%|          | 0/18239 [00:00<?, ? examples/s]

INFO 01-16 08:26:04 chat_utils.py:333] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.



[Acessed prompts:   0% 0/18239 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   0% 1/18239 [00:20<102:14:41, 20.18s/it, est. speed input: 23.98 toks/s, output: 9.36 toks/s]
[Acessed prompts:   0% 2/18239 [00:20<42:28:04,  8.38s/it, est. speed input: 48.41 toks/s, output: 18.66 toks/s]
[Acessed prompts:   0% 3/18239 [00:20<23:44:56,  4.69s/it, est. speed input: 70.83 toks/s, output: 27.72 toks/s]
[Acessed prompts:   0% 5/18239 [00:20<10:48:27,  2.13s/it, est. speed input: 115.18 toks/s, output: 45.68 toks/s]
[Acessed prompts:   0% 6/18239 [00:21<8:12:19,  1.62s/it, est. speed input: 135.92 toks/s, output: 54.23 toks/s] 
[Acessed prompts:   0% 8/18239 [00:21<4:51:24,  1.04it/s, est. speed input: 175.24 toks/s, output: 71.91 toks/s]
[Acessed prompts:   0% 9/18239 [00:21<3:54:45,  1.29it/s, est. speed input: 194.92 toks/s, output: 80.58 toks/s]
[Acessed prompts:   0% 10/18239 [00:21<3:09:10,  1.61it/s, est. speed input: 216.76 toks/s, outpu

In [None]:
# Upload the dataset with the added critical_doubts column to the Hugging Face
# Hub repo below. NOTE(review): presumably needs Hub credentials with write
# access to this namespace; confirm before running.
doubts_ds.push_to_hub('amang1802/liar2-doubts')