In [118]:
from datasets import load_dataset, Dataset
from jinja2 import Template
from pydantic import BaseModel, TypeAdapter
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

import json
import traceback

In [102]:
# Number of Wikipedia articles to sample. Also used as the map() batch size
# further down, so the whole sample reaches generate_content in a single batch.
NUM_TOPICS = 1024 * 10
# Passed to vLLM as tensor_parallel_size when the engine is created.
NUM_GPUS = 4
# How many times dup_list repeats each element (1 means no duplication).
NUM_DUPS = 1

In [103]:
# Open the '20231101.en' config of wikimedia/wikipedia in streaming mode
# (streaming=True) rather than loading the whole train split up front.
ds = load_dataset('wikimedia/wikipedia', name='20231101.en', split='train', streaming=True)

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

In [104]:
# Read the Jinja2 source of the system prompt.
# The encoding is pinned so the prompt text does not depend on the machine's
# locale default.
with open("qna_oneturn.jinja2", encoding="utf-8") as f:
    template_str = f.read()

In [105]:
# Load the few-shot examples. The encoding is pinned so parsing does not depend
# on the machine's locale default.
with open("qna_shots.json", encoding="utf-8") as f:
    content_json = json.load(f)

# Attach a pretty-printed JSON rendering of each example's 'qna' value under
# 'qna_json_str' (presumably what the prompt template prints -- the template
# file is not visible here).
for item in content_json:
    item['qna_json_str'] = json.dumps(item['qna'], indent=2)

In [106]:
# Compile the system-prompt template source into a jinja2 Template object.
template = Template(template_str)

In [107]:
# Render the system prompt once, exposing the few-shot examples to the template
# as `examples`. The same string is reused for every request in generate_content.
system_prompt = template.render(examples=content_json)

In [108]:
# Print the rendered system prompt for inspection.
print(system_prompt)

# Instructions

You are a pedogogical expert is going to construct questions and answers from user provided text. Construct questions that are helpful to students who are learning about the topic and provide meaningful insights in your answers. They should help them learn facts, understand concepts, and also connect different facts and concepts. Use illustrative examples and analogies to explain concepts.

Output as many questions and answer pairs as you can. Ensure every little detail or concept is discussed in at least one question and answer pair. It's fine if multiple questions discuss the same or similar details from the text. Generate upto 100 questions and answer pairs.

# Output

Respond in a JSON in the following format:

[
  {
    "question": "string",
    "answer": "string",
  },
  ...
]

# Examples
The following examples show the types of questions and answers to generate for a given text. They are only a small sample of relevant question and answer pairs that should be gen

In [10]:
# Model identifier handed to vLLM when the engine is created.
model_id = "meta-llama/Llama-3.3-70B-Instruct"

In [11]:
# Create the vLLM engine for model_id, with tensor parallelism across NUM_GPUS.
llm = LLM(
    model=model_id,
    tensor_parallel_size=NUM_GPUS,
    max_model_len=49152,
    gpu_memory_utilization=0.98,
)

INFO 01-05 16:56:57 config.py:478] This model supports multiple tasks: {'generate', 'score', 'embed', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 01-05 16:56:58 config.py:1216] Defaulting to use mp for distributed inference
INFO 01-05 16:56:58 config.py:1364] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 01-05 16:56:58 llm_engine.py:249] Initializing an LLM engine (v0.6.5) with config: model='meta-llama/Llama-3.3-70B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.3-70B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=49152, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backe

Loading safetensors checkpoint shards:   0% Completed | 0/30 [00:00<?, ?it/s]


INFO 01-05 16:57:22 model_runner.py:1097] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=502663)[0;0m INFO 01-05 16:57:22 model_runner.py:1097] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=502662)[0;0m INFO 01-05 16:57:22 model_runner.py:1097] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=502664)[0;0m INFO 01-05 16:57:22 model_runner.py:1097] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=502663)[0;0m INFO 01-05 16:57:25 worker.py:241] Memory profiling takes 2.99 seconds
[1;36m(VllmWorkerProcess pid=502663)[0;0m INFO 01-05 16:57:25 worker.py:241] the current vLLM instance can use total_gpu_memory (139.72GiB) x gpu_memory_utilization (0.98) = 136.92GiB
[1;36m(VllmWorkerProcess pid=502663)[0;0m INFO 01-05 16:57:25 worker.py:241] model weights take 32.89GiB; non_torch_memory takes 4.17GiB; PyTorch activation peak memory takes 0.21GiB; the rest of the memory reserved for KV Cache is 99.65GiB.
[1

In [109]:
def dup_list(l, num_dups=None):
    """Repeat every element of ``l`` consecutively.

    Args:
        l: List of items to repeat.
        num_dups: Number of copies of each element. When omitted, the
            module-level ``NUM_DUPS`` is used.

    Returns:
        A new flat list in which each element of ``l`` appears ``num_dups``
        times in a row, e.g. ``[a, b]`` with 2 copies gives ``[a, a, b, b]``.
        The copies are references to the same object, not deep copies.
    """
    if num_dups is None:
        num_dups = NUM_DUPS

    return [item for item in l for _ in range(num_dups)]

In [110]:
# Schema of a single generated question/answer pair.
# NOTE: documented with comments rather than a class docstring on purpose:
# pydantic copies a model's docstring into the generated JSON schema as its
# `description`, which would change the schema used for guided decoding.
class QnA(BaseModel):
    question: str
    answer: str

# Adapter for the full response shape: a list of QnA objects.
ta = TypeAdapter(list[QnA])

# JSON Schema for list[QnA]; generate_content passes it to GuidedDecodingParams.
json_schema = ta.json_schema()

In [111]:
# Jinja2 source that renders a list of Q&A mappings as plain text, emitting a
# "Question: ..." line and an "Answer: ..." line per entry. The .strip() removes
# the newlines the triple-quoted literal adds at both ends.
qna_jinja = """
{% for qa in qna %}
Question: {{ qa['question'] }}
Answer: {{ qa['answer'] }}
{% endfor %}
""".strip()

# Compiled once here; generate_content calls .render(qna=...) on it per row.
qna_template = Template(qna_jinja)

In [119]:
def generate_content(ids, urls, titles, texts):
    """Generate synthetic Q&A text for a batch of articles.

    Written to be used as a batched ``map`` function: each argument is a list
    holding one column of the batch, and the result maps column names to lists.

    Args:
        ids: Article ids; carried through to the output.
        urls: Article URLs; carried through to the output.
        titles: Article titles; carried through and used as the heading of the
            synthetic text.
        texts: Article bodies; each one becomes the user message of a chat
            prompt that starts with the shared ``system_prompt``.

    Returns:
        A dict with keys ``"id"``, ``"url"``, ``"title"``, ``"text"`` and
        ``"synthetic_content"``. Every input row is repeated via ``dup_list``,
        and the model is sampled once per repeated row. A row whose response is
        not valid JSON is kept, with no Q&A entries rendered under its heading.
    """
    messages = [
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": "Text:\n" + text + "\n\nOutput JSON:"},
        ]
        for text in texts
    ]

    dup_messages = dup_list(messages)
    dup_ids = dup_list(ids)
    dup_urls = dup_list(urls)
    dup_titles = dup_list(titles)
    dup_texts = dup_list(texts)

    # Constrain decoding to the list[QnA] JSON schema.
    guided_decoding_params = GuidedDecodingParams(json=json_schema)
    sampling_params = SamplingParams(
        temperature=0.9,
        top_p=0.9,
        max_tokens=16384,
        guided_decoding=guided_decoding_params,
    )
    outputs = llm.chat(dup_messages, sampling_params)

    qnas = []
    num_failed = 0
    for output in outputs:
        response = output.outputs[0].text.strip()
        try:
            qna = json.loads(response)
        except json.JSONDecodeError:
            # Best effort: keep the row with an empty Q&A list instead of
            # aborting the whole batch. Presumably this happens when generation
            # stops at max_tokens before the JSON array is closed -- TODO confirm.
            qna = []
            num_failed += 1

        qnas.append(qna)

    if num_failed:
        # Report failures once per batch so they are not silently lost.
        print(f"generate_content: {num_failed} of {len(outputs)} responses were not valid JSON; their Q&A section is empty")

    return {
        "id": dup_ids,
        "url": dup_urls,
        "title": dup_titles,
        "text": dup_texts,
        "synthetic_content": [f"**Title**\n{title}\n\n**Q&A**{qna_template.render(qna=qna)}" for title, qna in zip(dup_titles, qnas)]
    }

In [120]:
# Pipeline over the streamed dataset: shuffle, keep NUM_TOPICS rows, then run
# generate_content over them as one batch (batch_size equals the row count).
syn_ds_stream = (
    ds.shuffle(seed=1998, buffer_size=1_000_000)
    .take(NUM_TOPICS)
    .map(
        generate_content,
        batched=True,
        batch_size=NUM_TOPICS,
        input_columns=["id", "url", "title", "text"],
    )
)

In [121]:
# Drain the stream into a list. The generation progress bar is emitted under
# this cell, so this is the long-running step.
syn_ds_list = list(syn_ds_stream)

Processed prompts: 100% 10240/10240 [4:26:35<00:00,  1.56s/it, est. speed input: 8411.16 toks/s, output: 346.40 toks/s] 


In [122]:
# Build a Dataset from the collected rows and upload it to the Hugging Face Hub
# under the given repo id.
syn_ds = Dataset.from_list(syn_ds_list)
syn_ds.push_to_hub('amang1802/synthetic_data_qna_fulltext_conditioned_L3.3_70B')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]



CommitInfo(commit_url='https://huggingface.co/datasets/amang1802/synthetic_data_qna_fulltext_conditioned_L3.3_70B/commit/d79ec6c146f50c845c5c3fa6ba8a2bbb36b27088', commit_message='Upload dataset', commit_description='', oid='d79ec6c146f50c845c5c3fa6ba8a2bbb36b27088', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/amang1802/synthetic_data_qna_fulltext_conditioned_L3.3_70B', endpoint='https://huggingface.co', repo_type='dataset', repo_id='amang1802/synthetic_data_qna_fulltext_conditioned_L3.3_70B'), pr_revision=None, pr_num=None)