In [1]:
from datasets import load_dataset, Dataset
from jinja2 import Template
from transformers import AutoTokenizer
from pydantic import BaseModel, TypeAdapter
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

import json
import random
import traceback

In [2]:
# Number of GPUs the model is sharded across (used as tensor_parallel_size when the LLM is built).
NUM_GPUS = 4
# Number of rows handed to compute_gt_accuracy per Dataset.map batch (used as batch_size).
BATCH_SIZE = 512

In [3]:
#persona_content_ds = load_dataset('amang1802/wiki_topic_persona_sampled_405B')['train']

In [4]:
def add_text_to_persona_ds(ds, text_source=None):
    """Attach the source text to every row of ``ds`` by looking it up via ``id``.

    Args:
        ds: Dataset-like object with an ``id`` column that supports
            ``map(fn, input_columns=[...])``.
        text_source: Iterable of rows (mappings with ``id`` and ``text`` keys)
            that supplies the text for each id. When omitted, the
            notebook-level ``content_ds`` is used, so existing one-argument
            calls keep working; passing it explicitly avoids depending on
            which cells happen to have been executed.

    Returns:
        Whatever ``ds.map`` returns, with a ``text`` entry added per row.

    Raises:
        KeyError: from inside ``map`` if an id in ``ds`` is absent from
            ``text_source``.
    """
    if text_source is None:
        text_source = content_ds

    # Single pass over the rows; when an id repeats, the last occurrence wins.
    id_to_text = {row['id']: row['text'] for row in text_source}

    return ds.map(lambda idx: {"text": id_to_text[idx]}, input_columns=['id'])

In [5]:
#persona_content_ds = add_text_to_persona_ds(persona_content_ds)

In [6]:
def pick_one_per_persona_ds(ds):
    """Keep, for each content id, only the rows of one randomly drawn persona.

    For every distinct ``id`` a persona is drawn with ``random.choice`` from
    ALL distinct ``persona_id`` values in ``ds`` (not only those that occur
    with that id). Rows whose ``(id, persona_id)`` pair was drawn are kept,
    so an id whose drawn persona never occurs with it ends up with no rows.

    Args:
        ds: Dataset-like object exposing ``id`` and ``persona_id`` columns via
            ``ds[column]`` and supporting ``filter(fn)`` with a per-row dict.

    Returns:
        The result of ``ds.filter`` restricted to the drawn pairs.
    """
    uniq_personas = list(set(ds['persona_id']))
    uniq_contents = list(set(ds['id']))
    # A set gives O(1) membership tests inside the filter; with a list every
    # row triggered a linear scan, making the whole filter quadratic.
    included_pairs = {(cid, random.choice(uniq_personas)) for cid in uniq_contents}

    return ds.filter(lambda row: (row['id'], row['persona_id']) in included_pairs)

In [7]:
#persona_uniq_ds = pick_one_per_persona_ds(persona_content_ds)

In [8]:
#persona_uniq_ds

In [9]:
#assert persona_uniq_ds.num_rows == len(set(persona_uniq_ds['id']))

In [10]:
# Read the Jinja2 source of the judge prompt. The encoding is pinned to UTF-8
# so the prompt text does not depend on the platform's default encoding.
with open("gt_accuracy.jinja2", encoding="utf-8") as f:
    template_str = f.read()

In [11]:
# Load the few-shot examples that get rendered into the prompt. The encoding is
# pinned to UTF-8 instead of relying on the platform's default.
with open("few_shots.json", encoding="utf-8") as f:
    examples_json = json.load(f)

In [12]:
# Replace each example's parsed `matches` value with pretty-printed JSON text
# so the template can embed it verbatim. The isinstance guard keeps this cell
# idempotent: re-running it would otherwise JSON-encode the already-encoded
# string a second time.
for example in examples_json:
    if not isinstance(example['matches'], str):
        example['matches'] = json.dumps(example['matches'], indent=2)

In [13]:
# Compile the template source read from gt_accuracy.jinja2 into a Jinja2 Template.
template = Template(template_str)

In [14]:
# Render the system prompt; the few-shot examples are exposed to the template as `examples`.
system_prompt = template.render(examples=examples_json)

In [15]:
# print (rather than a bare expression) so newlines in the prompt render as line breaks.
print(system_prompt)

# Instructions

You are a fact checker and you're required to compare a pair of texts and find segments that discuss the same facts and judge if they both match on the facts. The goal is to only judge the alignment on facts stated by both. They can state unique facts which we have to ignore. For a pair of texts, output the a list of common segments and if they match or not.

On the inclusion of segments:
- Inspect every sentence in text1 and text2 and include all segments that discuss common facts.
- If one of the text has a segment with no similar segment in the other text, ignore that segment altogether.
- Do not pair segments that have different facts. For example: If one says Roger Federer won the Wimbledon in 2003, and the other says Roger Federer won the French Open in 2009 - they are different facts and shouldn't be paired together.
- Repeating this instruction: Include all segments that discuss common facts.

On the matching sensitivity:
- It's possible that two statements don'

In [16]:
# Identifier of the judge model; passed as `model=` when the vLLM engine is created.
model_id = "Qwen/Qwen2.5-72B-Instruct"

In [17]:
# One entry of the judge model's structured response.
# NOTE(review): documented with `#` comments on purpose - pydantic copies a
# model's class docstring into the generated JSON schema (as `description`),
# which would change `json_schema` below.
class Judgement(BaseModel):
    text1: str  # presumably the segment taken from text1 - confirm against the prompt template
    text2: str  # presumably the corresponding segment from text2 - confirm against the prompt template
    rationale: str  # presumably the model's explanation for the verdict - confirm against the prompt template
    match: bool  # entries where this is truthy are counted as matches by compute_gt_accuracy

# Adapter for the top-level response type: a list of Judgement objects.
ta = TypeAdapter(list[Judgement])

# JSON Schema for that list; handed to GuidedDecodingParams(json=...) in compute_gt_accuracy.
json_schema = ta.json_schema()

In [18]:
# Create the vLLM engine for the judge model, using NUM_GPUS-way tensor parallelism.
# max_model_len sets the maximum sequence length to 24576 tokens; gpu_memory_utilization=0.98
# lets vLLM use 98% of each GPU's total memory.
llm = LLM(model=model_id, max_model_len=24576, tensor_parallel_size=NUM_GPUS, gpu_memory_utilization=0.98)

INFO 12-28 05:34:33 config.py:478] This model supports multiple tasks: {'generate', 'embed', 'classify', 'score', 'reward'}. Defaulting to 'generate'.
INFO 12-28 05:34:33 config.py:1216] Defaulting to use mp for distributed inference
INFO 12-28 05:34:33 llm_engine.py:249] Initializing an LLM engine (v0.6.5) with config: model='Qwen/Qwen2.5-72B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-72B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=24576, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=Fa

Loading safetensors checkpoint shards:   0% Completed | 0/37 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=167958)[0;0m INFO 12-28 05:35:09 model_runner.py:1097] Loading model weights took 33.9833 GB
INFO 12-28 05:35:09 model_runner.py:1097] Loading model weights took 33.9833 GB
[1;36m(VllmWorkerProcess pid=167957)[0;0m INFO 12-28 05:35:09 model_runner.py:1097] Loading model weights took 33.9833 GB
[1;36m(VllmWorkerProcess pid=167959)[0;0m INFO 12-28 05:35:09 model_runner.py:1097] Loading model weights took 33.9833 GB
[1;36m(VllmWorkerProcess pid=167958)[0;0m INFO 12-28 05:35:13 worker.py:241] Memory profiling takes 3.74 seconds
[1;36m(VllmWorkerProcess pid=167958)[0;0m [1;36m(VllmWorkerProcess pid=167957)[0;0m INFO 12-28 05:35:13 worker.py:241] the current vLLM instance can use total_gpu_memory (139.72GiB) x gpu_memory_utilization (0.98) = 136.92GiB
INFO 12-28 05:35:13 worker.py:241] Memory profiling takes 3.74 seconds
[1;36m(VllmWorkerProcess pid=167958)[0;0m [1;36m(VllmWorkerProcess pid=167957)[0;0m INFO 12-28 05:35:13 worker.py:241] model wei

In [19]:
def _score_judgement_response(response):
    """Parse one raw model response and compute its match ratio.

    Returns:
        ``(judgement, score)`` where ``judgement`` is the parsed JSON value and
        ``score`` is the fraction of entries whose ``match`` is truthy.
        ``([], -1.0)`` is returned when the response cannot be parsed or
        scored, and the score is ``-1.0`` when the parsed value is empty.
    """
    try:
        judgement = json.loads(response)
        num_matches = sum(1 for j in judgement if j['match'])
    except Exception:
        # Deliberate best effort: one malformed or truncated response must not
        # abort the whole batch. The judgement is reset to [] so a partially
        # valid parse never ends up in the output column.
        return [], -1.0

    if not judgement:
        # Nothing to compare: use the same float sentinel as a failure so the
        # score column never mixes int and float values.
        return judgement, -1.0
    return judgement, num_matches / len(judgement)


def compute_gt_accuracy(gt_texts, synthetic_texts):
    """Judge factual agreement between ground-truth and synthetic texts.

    Written to be used with a batched ``Dataset.map``: it receives two
    parallel sequences and returns one value per input pair for each output
    column.

    Args:
        gt_texts: Sequence of ground-truth texts (sent to the model as text1).
        synthetic_texts: Sequence of synthetic texts (sent as text2), paired
            positionally with ``gt_texts``.

    Returns:
        dict with two lists, in the order of the outputs returned by
        ``llm.chat``:
          - ``"judgement"``: the parsed JSON response per pair (``[]`` when it
            could not be parsed or scored).
          - ``"accuracy_score"``: fraction of judged segments marked as
            matching, or ``-1.0`` when there is nothing valid to score.
    """
    messages = [[{"role": "system", "content": system_prompt},
                 {"role": "user", "content": "text1:\n" + text1 + "\n\ntext2:\n" + text2 + "\n\nresponse:"}]
                for text1, text2 in zip(gt_texts, synthetic_texts)]

    # Constrain generation to the JSON schema derived from list[Judgement].
    guided_decoding_params = GuidedDecodingParams(json=json_schema)
    sampling_params = SamplingParams(temperature=0.3, top_p=0.9, max_tokens=1536,
                                     guided_decoding=guided_decoding_params)
    outputs = llm.chat(messages, sampling_params)

    judgements = []
    scores = []
    for output in outputs:
        judgement, score = _score_judgement_response(output.outputs[0].text.strip())
        judgements.append(judgement)
        scores.append(score)

    return {
        "judgement": judgements,
        "accuracy_score": scores
    }

In [20]:
def get_score(ds):
    """Return the mean accuracy over rows that received a valid score.

    Args:
        ds: Mapping-like object whose ``'accuracy_score'`` entry is an
            iterable of numbers. Negative values are the "could not be
            judged" sentinel and are excluded from the mean.

    Returns:
        The arithmetic mean of the non-negative scores, or ``float('nan')``
        when no row has a valid score (instead of raising ZeroDivisionError).
    """
    valid_scores = [score for score in ds['accuracy_score'] if score >= 0]
    if not valid_scores:
        return float('nan')
    return sum(valid_scores) / len(valid_scores)

In [21]:
#judged_ds1 = persona_uniq_ds.map(compute_gt_accuracy, input_columns=['text', 'synthetic_content'], batched=True, batch_size=BATCH_SIZE)

In [22]:
#get_score(judged_ds1)

In [23]:
#judged_ds1.push_to_hub('amang1802/wiki_topic_persona_405B_uniq_gt_accuracy')

In [27]:
# Hub dataset repositories to judge. Each one is loaded, scored with
# compute_gt_accuracy, and pushed back under the same repository name.
ds_list = [
    'amang1802/synthetic_data_topic_conditioned_L3.3_70B',
    'amang1802/synthetic_data_prefix_conditioned_L3.3_70B',
    'amang1802/synthetic_data_fulltext_conditioned_L3.3_70B'
]

In [None]:
# Judge every dataset in ds_list, push the judged rows back to the same Hub
# repository, and record the mean accuracy per repository name.
scores = {}
for ds_name in ds_list:
    content_ds = load_dataset(ds_name)['train']
    judged_ds = content_ds.map(compute_gt_accuracy, input_columns=['text', 'synthetic_content'], batched=True, batch_size=BATCH_SIZE)
    judged_ds.push_to_hub(ds_name)
    # get_score reads the 'accuracy_score' column, so it needs the judged
    # Dataset - passing the repository-name string raises a TypeError.
    scores[ds_name] = get_score(judged_ds)

Map:   0%|          | 0/10240 [00:00<?, ? examples/s]


[Acessed prompts:   0% 0/512 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   0% 1/512 [01:41<14:25:08, 101.58s/it, est. speed input: 40.99 toks/s, output: 1.00 toks/s]
[Acessed prompts:   0% 2/512 [01:42<6:02:29, 42.65s/it, est. speed input: 83.15 toks/s, output: 2.07 toks/s]  
[Acessed prompts:   1% 3/512 [01:43<3:19:03, 23.46s/it, est. speed input: 122.73 toks/s, output: 3.15 toks/s]
[Acessed prompts:   1% 4/512 [01:45<2:08:09, 15.14s/it, est. speed input: 159.59 toks/s, output: 4.28 toks/s]
[Acessed prompts:   1% 6/512 [01:47<1:03:40,  7.55s/it, est. speed input: 241.06 toks/s, output: 6.67 toks/s]
[Acessed prompts:   2% 8/512 [01:48<38:29,  4.58s/it, est. speed input: 329.75 toks/s, output: 9.14 toks/s]  
[Acessed prompts:   2% 9/512 [01:49<30:31,  3.64s/it, est. speed input: 373.86 toks/s, output: 10.41 toks/s]

In [None]:
# Show the mean accuracy recorded for each dataset repository.
print(scores)