In [6]:
from datasets import load_dataset, DatasetDict
from sklearn.cluster import KMeans
from vllm import LLM

import json
import numpy as np
import torch

In [7]:
column = 'synthetic_content'
dedup_ratio = 2

In [8]:
model_id = "Alibaba-NLP/gte-Qwen2-7B-instruct"

In [9]:
instruction = "Identify the factual information, named entities, concepts and themes from the knowledge content."

In [10]:
llm = LLM(model=model_id, max_model_len=4096, hf_overrides={"is_causal": False}, task="embed")

config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

INFO 01-06 06:59:39 config.py:274] Overriding HF config with {'is_causal': False}


sentence_bert_config.json:   0%|          | 0.00/55.0 [00:00<?, ?B/s]

INFO 01-06 06:59:39 config.py:2167] Downcasting torch.float32 to torch.float16.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

INFO 01-06 06:59:44 llm_engine.py:249] Initializing an LLM engine (v0.6.5) with config: model='Alibaba-NLP/gte-Qwen2-7B-instruct', speculative_config=None, tokenizer='Alibaba-NLP/gte-Qwen2-7B-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=Alibaba-NLP/gte-Qwen2-7B-instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

INFO 01-06 06:59:47 selector.py:120] Using Flash Attention backend.
INFO 01-06 06:59:49 model_runner.py:1092] Starting to load model Alibaba-NLP/gte-Qwen2-7B-instruct...
INFO 01-06 06:59:49 weight_utils.py:243] Using model weights format ['*.safetensors']


model-00001-of-00007.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00007.safetensors:   0%|          | 0.00/4.78G [00:00<?, ?B/s]

model-00003-of-00007.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00007.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00005-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00006-of-00007.safetensors:   0%|          | 0.00/3.66G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:   0%|          | 0.00/2.17G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/7 [00:00<?, ?it/s]


INFO 01-06 07:00:31 model_runner.py:1097] Loading model weights took 13.2529 GB


In [11]:
def compute_embedding(texts):
    tokenizer = llm.get_tokenizer()
    instruct_prompts = ["Instruct: " + instruction + "\nQuery:\n" + text for text in texts]
    prompt_token_ids = [{"prompt_token_ids": tokenizer.encode(prompt)[:4096]} for prompt in instruct_prompts]
    outputs = llm.embed(prompt_token_ids)

    return {"embedding": [output.outputs.embedding for output in outputs]}

In [12]:
def embed(content_ds):
    embed_ds = content_ds.map(compute_embedding, input_columns=[column], batched=True, batch_size=1024)
    embeds = np.array(embed_ds['embedding'])

    return embed_ds, embeds

In [13]:
# TODO: use faiss
def cluster(ds_id, embed_ds, embeds):
    n_clusters = embed_ds.num_rows // dedup_ratio
    print(f"Kmeans with n_clusters={n_clusters}")

    kmeans = KMeans(n_clusters=n_clusters)
    clusters = kmeans.fit_predict(embeds)

    clusters_ds = embed_ds.map(lambda _, idx: {"cluster_id": clusters[idx]}, with_indices=True)
    print(f"{clusters_ds!r}")
    clusters_ds.push_to_hub(ds_id)

    return clusters

In [14]:
# TODO: use faiss
def dedup(ds_id, content_ds, embeds, clusters):
    uniq_clusters = np.unique(clusters)
    uniq_clusters

    centroid_ids = []
    for idk in uniq_clusters:
        cluster_points = embeds[clusters == idk, :]
        centroid = cluster_points.mean(axis=0)
        cluster_centroid_dist = np.linalg.norm(embeds - centroid, axis=1)
        nearest_index = np.argmin(cluster_centroid_dist)
        centroid_ids.append(nearest_index)

    assert len(centroid_ids) == len(uniq_clusters)

    centroid_ds = content_ds.filter(lambda _, idx: idx in centroid_ids, with_indices=True)
    not_centroid_ds = content_ds.filter(lambda _, idx: not idx in centroid_ids, with_indices=True)

    deduped_ds = DatasetDict({"train": centroid_ds, "test": not_centroid_ds})
    print(f"{deduped_ds!r}")
    deduped_ds.push_to_hub(ds_id + "_deduped")

In [15]:
def top_clusters(content_ds, clusters):
    top_k_cluster_ids = np.argsort(np.bincount(clusters))[-3:]
 
    for c, tc_idx in enumerate(top_k_cluster_ids):
        sample_idx = np.where(clusters == tc_idx)[0][0]
        print(f"Sample from cluster: {c}" + "\n\n")
        print(content_ds[int(sample_idx)]['synthetic_content'])
        print("\n-------\n\n")

In [16]:
def run(ds_id):
    content_ds = load_dataset(ds_id)['train']
    embed_ds, embeds = embed(content_ds)
    clusters = cluster(ds_id, embed_ds, embeds)

    dedup(ds_id, content_ds, embeds, clusters)
    top_clusters(content_ds, clusters)

In [17]:
ds_ids = [
    # 'amang1802/synthetic_data_unconditioned_L3.1_405B_Instruct',
    # 'amang1802/synthetic_data_topic_conditioned_L3.3_70B',
    # 'amang1802/synthetic_data_prefix_conditioned_L3.3_70B',
    # 'amang1802/synthetic_data_fulltext_conditioned_L3.3_70B',
    'amang1802/synthetic_data_qna_fulltext_conditioned_L3.3_70B',
]

for ds_id in ds_ids:
    run(ds_id)



Map:   0%|          | 0/10240 [00:00<?, ? examples/s]


[Acessed prompts:   0% 0/1024 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   0% 1/1024 [00:01<19:54,  1.17s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  19% 191/1024 [00:01<00:04, 187.11it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  26% 262/1024 [00:02<00:05, 136.53it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  32% 331/1024 [00:02<00:05, 116.89it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  39% 398/1024 [00:03<00:06, 103.07it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  45% 462/1024 [00:04<00:06, 93.61it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] 
[Acessed prompts:  52% 528/1024 [00:05<00:05, 91.27it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  58% 596/1024 [00:06<00:04, 90.16it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompt

Kmeans with n_clusters=5120


Map:   0%|          | 0/10240 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'url', 'title', 'text', 'synthetic_content', 'embedding', 'cluster_id'],
    num_rows: 10240
})


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]



Filter:   0%|          | 0/10240 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10240 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text', 'synthetic_content'],
        num_rows: 5119
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'text', 'synthetic_content'],
        num_rows: 5121
    })
})


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Sample from cluster: 0


**Title**
1561 in science

**Q&A**
Question: What significant event occurred in cartography and navigation in 1561?
Answer: Bartolomeu Velho produced a Carta General do Orbe for Sebastian of Portugal, and Richard Eden translated Martín Cortés de Albacar's Arte de navigar as The Arte of Navigation, which became the first manual of navigation in English.

Question: What medical publications were released in 1561?
Answer: Gabriele Falloppio published Observationes anatomicae in Venice, and Ambroise Paré published Anatomie universelle du corps humain and La méthode curative des playes et fractures de la test humaine in Paris.

Question: What epidemic occurred in Chile in 1561?
Answer: A smallpox epidemic occurred in Chile.

Question: Who were some notable births in the field of science and technology in 1561?
Answer: Notable births in 1561 included Thomas Fincke, a Danish mathematician; Francis Bacon, an English philosopher of science; Sanctorius, an Istrian physio