In [1]:
from datasets import load_dataset, DatasetDict
from sklearn.cluster import KMeans
from vllm import LLM

import json
import numpy as np
import torch

In [2]:
# Dataset column holding the text to embed; embed() passes it to Dataset.map
# as `input_columns`.
column = 'synthetic_content'
# Target number of rows per cluster: cluster() asks KMeans for
# num_rows // dedup_ratio clusters, and dedup() keeps one row per cluster.
dedup_ratio = 2

In [3]:
# Model identifier handed to vLLM's LLM(...) constructor below.
model_id = "Alibaba-NLP/gte-Qwen2-7B-instruct"

In [4]:
# Task instruction that compute_embedding() prepends to every text
# ("Instruct: <instruction>\nQuery:\n<text>") before tokenizing.
instruction = "Identify the factual information, named entities, concepts and themes from the knowledge content."

In [5]:
# Load the embedding model once; compute_embedding() reuses this global engine.
# max_model_len=4096 matches the [:4096] token truncation in compute_embedding().
# hf_overrides forces is_causal=False in the HF config (presumably to enable
# bidirectional attention for this embedding model -- TODO confirm against the
# model card); task="embed" selects the embedding task used by llm.embed().
llm = LLM(model=model_id, max_model_len=4096, hf_overrides={"is_causal": False}, task="embed")

INFO 12-29 23:09:29 config.py:280] Overriding HF config with {'is_causal': False}
INFO 12-29 23:09:29 config.py:2272] Downcasting torch.float32 to torch.float16.
INFO 12-29 23:09:34 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='Alibaba-NLP/gte-Qwen2-7B-instruct', speculative_config=None, tokenizer='Alibaba-NLP/gte-Qwen2-7B-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, serv

Loading safetensors checkpoint shards:   0% Completed | 0/7 [00:00<?, ?it/s]


INFO 12-29 23:09:41 model_runner.py:1099] Loading model weights took 13.2529 GB


In [6]:
def compute_embedding(texts):
    """Embed a batch of texts with the notebook-level vLLM engine.

    Each text is wrapped as ``"Instruct: <instruction>\\nQuery:\\n<text>"``,
    tokenized with the engine's tokenizer, cut to at most 4096 token ids and
    sent to ``llm.embed``.

    Args:
        texts: iterable of strings (one batch of the text column).

    Returns:
        ``{"embedding": [...]}`` with one embedding per input text, in the
        order ``llm.embed`` returns its outputs.
    """
    tok = llm.get_tokenizer()

    requests = []
    for text in texts:
        prompt = "Instruct: " + instruction + "\nQuery:\n" + text
        # Truncate on token ids so the request never exceeds 4096 tokens.
        requests.append({"prompt_token_ids": tok.encode(prompt)[:4096]})

    results = llm.embed(requests)
    embeddings = [result.outputs.embedding for result in results]

    return {"embedding": embeddings}

In [7]:
def embed(content_ds):
    """Add an ``embedding`` column to ``content_ds`` and return it as an array too.

    Args:
        content_ds: dataset exposing ``.map``; its ``column`` field (the
            notebook-level setting) is fed to ``compute_embedding`` in batches
            of 1024 rows.

    Returns:
        ``(embed_ds, embeds)`` where ``embed_ds`` is the mapped dataset and
        ``embeds`` is ``np.array`` built from its ``'embedding'`` column.
    """
    with_embeddings = content_ds.map(
        compute_embedding,
        input_columns=[column],
        batched=True,
        batch_size=1024,
    )

    return with_embeddings, np.array(with_embeddings['embedding'])

In [8]:
# TODO: use faiss
def cluster(ds_id, embed_ds, embeds, random_state=0):
    """Cluster the embeddings with KMeans and publish the labelled dataset.

    The number of clusters is ``embed_ds.num_rows // dedup_ratio`` (clamped to
    at least 1 so very small datasets do not ask KMeans for zero clusters).

    Args:
        ds_id: Hub repo id the labelled dataset is pushed to.
        embed_ds: dataset that already contains the embeddings; a
            ``cluster_id`` column is added to it.
        embeds: embedding matrix passed to ``KMeans.fit_predict``; row ``i``
            must correspond to row ``i`` of ``embed_ds``.
        random_state: seed forwarded to KMeans so cluster assignments (and
            everything derived from them) are reproducible across runs.
            Pass ``None`` for the previous unseeded behaviour.

    Returns:
        Array of cluster labels, one per row of ``embeds``.
    """
    # `//` can yield 0 when there are fewer rows than dedup_ratio.
    n_clusters = max(1, embed_ds.num_rows // dedup_ratio)
    print(f"Kmeans with n_clusters={n_clusters}")

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    clusters = kmeans.fit_predict(embeds)

    # Attach each row's label by position; relies on embeds and embed_ds
    # sharing the same row order.
    clusters_ds = embed_ds.map(lambda _, idx: {"cluster_id": clusters[idx]}, with_indices=True)
    print(f"{clusters_ds!r}")
    clusters_ds.push_to_hub(ds_id)

    return clusters

In [9]:
# TODO: use faiss
def dedup(ds_id, content_ds, embeds, clusters):
    """Keep one representative row per cluster and publish the split dataset.

    For every cluster the member closest (Euclidean distance) to the mean of
    that cluster's embeddings is chosen as its representative. Representatives
    go to the ``"train"`` split, all remaining rows to ``"test"``, and the
    resulting ``DatasetDict`` is pushed to ``ds_id + "_deduped"``.

    Args:
        ds_id: Hub repo id of the source dataset; ``"_deduped"`` is appended
            for the output repo.
        content_ds: dataset whose rows are being deduplicated; row ``i`` must
            correspond to ``embeds[i]`` and ``clusters[i]``.
        embeds: 2-D array of embeddings, one row per dataset row.
        clusters: 1-D array of cluster labels, one per dataset row.
    """
    embeds = np.asarray(embeds)
    clusters = np.asarray(clusters)
    uniq_clusters = np.unique(clusters)

    # A set gives O(1) membership tests inside the filter callbacks below.
    representative_ids = set()
    for cluster_id in uniq_clusters:
        member_ids = np.flatnonzero(clusters == cluster_id)
        members = embeds[member_ids]
        centroid = members.mean(axis=0)
        # Search only among this cluster's own members: measuring against all
        # embeddings can select a row from a different cluster, and two
        # clusters could then end up sharing one representative.
        member_dist = np.linalg.norm(members - centroid, axis=1)
        representative_ids.add(int(member_ids[np.argmin(member_dist)]))

    # Clusters are disjoint, so every cluster must contribute a distinct row.
    assert len(representative_ids) == len(uniq_clusters)

    centroid_ds = content_ds.filter(lambda _, idx: idx in representative_ids, with_indices=True)
    not_centroid_ds = content_ds.filter(lambda _, idx: idx not in representative_ids, with_indices=True)

    deduped_ds = DatasetDict({"train": centroid_ds, "test": not_centroid_ds})
    print(f"{deduped_ds!r}")
    deduped_ds.push_to_hub(ds_id + "_deduped")

In [10]:
def top_clusters(content_ds, clusters, k=3, text_column=None):
    """Print one sample text from each of the ``k`` most populated clusters.

    Clusters are printed in ascending order of size (largest last). The
    header shows the actual cluster id, and the sample is the first row
    (lowest index) assigned to that cluster.

    Args:
        content_ds: indexable collection of rows; ``content_ds[i][text_column]``
            is printed.
        clusters: 1-D array of non-negative integer cluster labels, one per row.
        k: how many of the largest clusters to show (default 3).
        text_column: name of the text field to print. Defaults to the
            notebook-level ``column`` setting.
    """
    if text_column is None:
        text_column = column

    clusters = np.asarray(clusters)
    if k <= 0 or clusters.size == 0:
        return

    sizes = np.bincount(clusters)
    # bincount also reports ids that never occur (size 0); drop them so they
    # can never be picked when fewer than k clusters are populated.
    populated_ids = np.flatnonzero(sizes)
    by_size = populated_ids[np.argsort(sizes[populated_ids], kind="stable")]

    for cluster_id in by_size[-k:]:
        sample_idx = np.flatnonzero(clusters == cluster_id)[0]
        print(f"Sample from cluster: {int(cluster_id)}" + "\n\n")
        print(content_ds[int(sample_idx)][text_column])
        print("\n-------\n\n")

In [11]:
def run(ds_id):
    """Run the full embed -> cluster -> dedup -> inspect pipeline for one dataset.

    Loads the ``'train'`` split of ``ds_id``, embeds it, clusters the
    embeddings (``cluster`` pushes the labelled dataset to ``ds_id``), writes
    the deduplicated splits via ``dedup`` and finally prints samples from the
    largest clusters.

    Args:
        ds_id: Hub dataset repo id to process.
    """
    train_split = load_dataset(ds_id)['train']

    embedded_ds, vectors = embed(train_split)
    labels = cluster(ds_id, embedded_ds, vectors)

    # dedup and top_clusters work on the original rows (without the
    # embedding column), indexed consistently with `vectors` / `labels`.
    dedup(ds_id, train_split, vectors, labels)
    top_clusters(train_split, labels)

In [12]:
# Hub dataset repos to process, in order. run() pushes results for each one
# before moving on to the next.
ds_ids = [
    'amang1802/synthetic_data_unconditioned_L3.1_405B_Instruct',
    'amang1802/synthetic_data_topic_conditioned_L3.3_70B',
    'amang1802/synthetic_data_prefix_conditioned_L3.3_70B',
    'amang1802/synthetic_data_fulltext_conditioned_L3.3_70B',
]

for ds_id in ds_ids:
    run(ds_id)



Map:   0%|          | 0/10244 [00:00<?, ? examples/s]


[Acessed prompts:   0% 0/1024 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   0% 1/1024 [00:02<36:50,  2.16s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   4% 38/1024 [00:04<01:29, 11.00it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   8% 77/1024 [00:06<01:02, 15.04it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  11% 116/1024 [00:07<00:50, 17.82it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  15% 150/1024 [00:09<00:49, 17.71it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  18% 185/1024 [00:11<00:46, 17.90it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  22% 224/1024 [00:13<00:43, 18.58it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  25% 260/1024 [00:15<00:41, 18.59it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  29%

Kmeans with n_clusters=5122


Map:   0%|          | 0/10244 [00:00<?, ? examples/s]

Dataset({
    features: ['synthetic_content', 'embedding', 'cluster_id'],
    num_rows: 10244
})


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]



Filter:   0%|          | 0/10244 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10244 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['synthetic_content'],
        num_rows: 5122
    })
    test: Dataset({
        features: ['synthetic_content'],
        num_rows: 5122
    })
})


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Sample from cluster: 0


**Learning and Development: The Science of Acquisition**

Introduction
------------

Learning and development are essential components of human growth and progress. The process of acquiring new skills, knowledge, and behaviors is a complex and multi-faceted one, influenced by a variety of factors including genetics, environment, and individual experiences. This article will explore the science of learning and development, examining the key concepts, theories, and research findings that underpin our understanding of this critical aspect of human development.

**Theories of Learning**

There are several major theories of learning, each attempting to explain the process by which we acquire new information and skills. Some of the most influential theories include:

*   **Behavioral Theory**: This theory proposes that learning occurs through the association of stimuli with responses, and that behavior is shaped by reinforcement and punishment.
*   **Cognitive Theory

Map:   0%|          | 0/10240 [00:00<?, ? examples/s]


[Acessed prompts:   0% 0/1024 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   0% 1/1024 [00:01<32:43,  1.92s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   6% 60/1024 [00:03<00:52, 18.33it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  11% 115/1024 [00:05<00:39, 22.99it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  17% 172/1024 [00:07<00:33, 25.56it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  23% 231/1024 [00:09<00:28, 27.44it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  28% 289/1024 [00:11<00:25, 28.42it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  34% 346/1024 [00:13<00:23, 28.83it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  39% 403/1024 [00:15<00:21, 29.03it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  45

Kmeans with n_clusters=5120


Map:   0%|          | 0/10240 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'url', 'title', 'text', 'synthetic_content', 'judgement', 'accuracy_score', 'embedding', 'cluster_id'],
    num_rows: 10240
})


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]



Filter:   0%|          | 0/10240 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10240 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text', 'synthetic_content', 'judgement', 'accuracy_score'],
        num_rows: 5120
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'text', 'synthetic_content', 'judgement', 'accuracy_score'],
        num_rows: 5120
    })
})


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Sample from cluster: 0


Diarra Sylla is a French professional footballer who plays as a striker for Swiss club Servette FC. Born on November 30, 1990, in Aubervilliers, France, Sylla began his football career at a young age, joining the youth academy of French club Le Mans FC. He quickly rose through the ranks, making his professional debut for Le Mans in 2009.

Sylla's early career was marked by loan spells at various French clubs, including Stade de Reims and Clermont Foot, before he eventually joined Belgian club KVC Westerlo in 2012. It was during his time at Westerlo that Sylla began to establish himself as a prolific goal-scorer, netting 15 goals in 23 appearances for the club.

In 2013, Sylla joined Belgian Pro League side KSC Lokeren, where he continued to impress with his goal-scoring form. Over the course of two seasons, he scored 15 goals in 32 appearances, earning himself a move to Belgian giants Anderlecht in 2015.

Sylla's time at Anderlecht was marked by success, as he 

Map:   0%|          | 0/10240 [00:00<?, ? examples/s]


[Acessed prompts:   0% 0/1024 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   0% 1/1024 [00:01<32:24,  1.90s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   7% 69/1024 [00:03<00:44, 21.24it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  13% 132/1024 [00:05<00:33, 26.59it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  19% 197/1024 [00:07<00:28, 29.40it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  26% 266/1024 [00:09<00:23, 31.69it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  33% 336/1024 [00:11<00:20, 33.25it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  40% 406/1024 [00:13<00:17, 34.36it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  46% 474/1024 [00:15<00:15, 34.72it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  53

Kmeans with n_clusters=5120


Map:   0%|          | 0/10240 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'url', 'title', 'text', 'prefix', 'synthetic_content', 'judgement', 'accuracy_score', 'embedding', 'cluster_id'],
    num_rows: 10240
})


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]



Filter:   0%|          | 0/10240 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10240 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text', 'prefix', 'synthetic_content', 'judgement', 'accuracy_score'],
        num_rows: 5120
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'text', 'prefix', 'synthetic_content', 'judgement', 'accuracy_score'],
        num_rows: 5120
    })
})


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Sample from cluster: 0


Deh-e Gowd, also known as Allahabad, Deh-e Gowda, Deh Goo, Deh-i-Gav, and Deh Ka'u, is a small village located in the Jazmurian Rural District, Jazmurian District, Rudbar-e Jonubi County, Kerman Province, Iran. As of the 2006 census, the village had a population of 413 people, comprising 92 families. The village is situated in a rural area, and its economy is likely based on agriculture and small-scale farming. The surrounding landscape is characterized by arid desert terrain, with limited vegetation and water resources. The village's remote location and limited access to modern amenities make it a challenging place to live, but the residents of Deh-e Gowd have adapted to the harsh environment and developed a unique culture and way of life. The village is part of the larger Rudbar-e Jonubi County, which is known for its rich history, cultural heritage, and natural beauty. The county is home to several other villages and towns, each with its own distinct charact

Map:   0%|          | 0/10240 [00:00<?, ? examples/s]


[Acessed prompts:   0% 0/1024 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   0% 1/1024 [00:01<33:07,  1.94s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   4% 45/1024 [00:03<01:11, 13.62it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   9% 91/1024 [00:05<00:50, 18.42it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  14% 144/1024 [00:07<00:39, 22.03it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  19% 196/1024 [00:09<00:34, 23.77it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  25% 252/1024 [00:11<00:30, 25.48it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  30% 308/1024 [00:13<00:26, 26.54it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  35% 357/1024 [00:15<00:25, 26.19it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  40%

Kmeans with n_clusters=5120


Map:   0%|          | 0/10240 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'url', 'title', 'text', 'synthetic_content', 'judgement', 'accuracy_score', 'embedding', 'cluster_id'],
    num_rows: 10240
})


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]



Filter:   0%|          | 0/10240 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10240 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text', 'synthetic_content', 'judgement', 'accuracy_score'],
        num_rows: 5120
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'text', 'synthetic_content', 'judgement', 'accuracy_score'],
        num_rows: 5120
    })
})


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Sample from cluster: 0


The name Laura Suarez or Suárez is associated with several notable individuals across various fields, each with their own unique achievements and contributions. One such individual is the Brazilian singer Laura Suarez, who was born in 1909 and passed away in 1990, leaving behind a legacy in the music industry. In the realm of sports, Laura Suárez, a Puerto Rican women's international footballer, has been making waves since her birth in 1992, showcasing her skills on the field. In the world of politics, Laura Margarita Suárez, a Mexican politician born in 1953, has been actively involved in shaping the country's governance and policies. Additionally, there is also a Cuban volleyball player by the name of Laura Suárez, who has represented her country in various international competitions, demonstrating her prowess in the sport. Each of these individuals, despite sharing a similar name, has carved out their own distinct path and achieved recognition in their respe