In [1]:
from datasets import load_dataset
from sklearn.cluster import KMeans
from vllm import LLM

import json
import numpy as np

In [2]:
# Fetch the persona sample from the Hugging Face Hub and keep only its
# 'train' split, which is the split every later cell works on.
personas_splits = load_dataset('amang1802/personas_sample_405B')
personas_ds = personas_splits['train']

In [3]:
# Hugging Face model id of the embedding model that is loaded into vLLM below.
model_id = "Alibaba-NLP/gte-Qwen2-7B-instruct"

In [4]:
# Task instruction that compute_embedding() prepends (as "Instruct: ...") to
# every persona before it is embedded.
instruction = "Identify the name, profession and personality of the person described."

In [5]:
# Build the vLLM engine that serves the embedding model.
llm = LLM(
    model=model_id,
    # The model repo ships its own Python code files, which this flag allows
    # to be downloaded and executed.
    trust_remote_code=True,
    # Upper bound on the token length of a single prompt.
    max_model_len=4096,
    # Override the HF config so the model is not run with causal attention.
    hf_overrides={"is_causal": False},
)

INFO 12-24 08:46:00 config.py:274] Overriding HF config with {'is_causal': False}
INFO 12-24 08:46:00 config.py:2167] Downcasting torch.float32 to torch.float16.
INFO 12-24 08:46:05 config.py:478] This model supports multiple tasks: {'generate', 'score', 'reward', 'embed', 'classify'}. Defaulting to 'embed'.
INFO 12-24 08:46:05 llm_engine.py:249] Initializing an LLM engine (v0.6.5) with config: model='Alibaba-NLP/gte-Qwen2-7B-instruct', speculative_config=None, tokenizer='Alibaba-NLP/gte-Qwen2-7B-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observab

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct:
- tokenization_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


INFO 12-24 08:46:07 selector.py:120] Using Flash Attention backend.
INFO 12-24 08:46:08 model_runner.py:1092] Starting to load model Alibaba-NLP/gte-Qwen2-7B-instruct...
INFO 12-24 08:46:09 weight_utils.py:243] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/7 [00:00<?, ?it/s]


INFO 12-24 08:46:18 model_runner.py:1097] Loading model weights took 13.2529 GB


In [6]:
def compute_embedding(personas):
    """Embed a batch of personas with the module-level vLLM engine.

    Each persona is serialized as pretty-printed JSON and wrapped in an
    "Instruct: <instruction>\\nQuery:\\n<json>" prompt built from the
    module-level ``instruction`` string.

    Args:
        personas: iterable of JSON-serializable persona objects (one batch).

    Returns:
        dict with a single key ``"embedding"`` holding one embedding per
        output returned by ``llm.embed`` (presumably one per prompt, in input
        order -- confirm against the vLLM version in use).
    """
    prompts = []
    for persona in personas:
        persona_json = json.dumps(persona, indent=2)
        prompts.append(f"Instruct: {instruction}\nQuery:\n{persona_json}")

    results = llm.embed(prompts)
    vectors = [result.outputs.embedding for result in results]

    return {"embedding": vectors}

In [7]:
# Embed every persona. compute_embedding receives the 'persona' column in
# batches of up to 1024 rows and its result is stored as an 'embedding' column.
embed_ds = personas_ds.map(
    compute_embedding,
    input_columns=['persona'],
    batched=True,
    batch_size=1024,
)



Map:   0%|          | 0/2002 [00:00<?, ? examples/s]


[Acessed prompts:   0% 0/1024 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   0% 1/1024 [00:01<20:25,  1.20s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   9% 96/1024 [00:01<00:10, 92.39it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  12% 127/1024 [00:02<00:13, 64.37it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  16% 159/1024 [00:03<00:16, 53.11it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  19% 190/1024 [00:03<00:17, 48.59it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  22% 221/1024 [00:04<00:17, 45.91it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  25% 253/1024 [00:05<00:17, 44.08it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  28% 285/1024 [00:06<00:17, 42.97it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  31

In [8]:
# Pull the embedding column out of the dataset and stack it into one NumPy
# array for clustering (one row per persona).
embedding_rows = embed_ds['embedding']
embeds = np.array(embedding_rows)

In [9]:
# Number of k-means clusters to split the persona embeddings into.
n_clusters = 25

In [10]:
# Partition the persona embeddings into `n_clusters` groups with k-means.
# random_state is pinned because k-means starts from randomly chosen centres:
# without a seed the cluster assignments -- and therefore which rows end up
# flagged as cluster centroids -- change on every run of the notebook.
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
# clusters[i] is the cluster label assigned to row i of `embeds`.
clusters = kmeans.fit_predict(embeds)

In [11]:
# Sorted array of the distinct cluster labels that were actually assigned;
# the bare expression on the last line displays it as the cell output.
uniq_clusters = np.unique(clusters)
uniq_clusters

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24], dtype=int32)

In [12]:
# For every cluster, pick one representative row: the member whose embedding
# lies closest (Euclidean distance) to the mean embedding of that cluster.
# centroid_ids collects the dataset row index of each representative.
centroid_ids = []
for idk in uniq_clusters:
    # Row indices (into `embeds` / the dataset) of this cluster's members.
    member_ids = np.flatnonzero(clusters == idk)
    cluster_points = embeds[member_ids]
    centroid = cluster_points.mean(axis=0)
    # Search only among the cluster's own members. Measuring against every
    # row could select a row that belongs to a different cluster, and two
    # clusters could then end up sharing the same representative.
    member_centroid_dist = np.linalg.norm(cluster_points - centroid, axis=1)
    # Map the position within the cluster back to the global row index.
    nearest_index = int(member_ids[np.argmin(member_centroid_dist)])
    centroid_ids.append(nearest_index)

In [13]:
# Sanity checks on the selected representatives.
# One representative per cluster ...
assert len(centroid_ids) == len(uniq_clusters)
# ... and no row selected for more than one cluster. A duplicate would
# otherwise go unnoticed and leave fewer flagged rows than clusters, because
# the flag is computed with a plain membership test on centroid_ids.
assert len(set(centroid_ids)) == len(centroid_ids), \
    "the same row was selected as the representative of more than one cluster"

In [14]:
def is_cluster_centroid(idx):
    """Tell whether dataset row ``idx`` was chosen as a cluster representative.

    Args:
        idx: row index to look up in the module-level ``centroid_ids`` list.

    Returns:
        dict with the single key ``"is_cluster_centroid"`` mapped to a bool.
    """
    chosen = idx in centroid_ids
    return {"is_cluster_centroid": chosen}

In [15]:
# Add an 'is_cluster_centroid' column. Only the row index matters here, so
# the row contents passed by map() are ignored.
centroid_ds = embed_ds.map(
    lambda _row, idx: is_cluster_centroid(idx),
    with_indices=True,
)

Map:   0%|          | 0/2002 [00:00<?, ? examples/s]

In [16]:
# Publish the dataset, now carrying the 'embedding' and 'is_cluster_centroid'
# columns, to the Hub. NOTE: this is the same repo id the data was loaded
# from at the top of the notebook, so the upload updates that dataset in place.
centroid_ds.push_to_hub('amang1802/personas_sample_405B')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]



CommitInfo(commit_url='https://huggingface.co/datasets/amang1802/personas_sample_405B/commit/c8d43a76a3a2de864372acbb1ff21f61609b2ce4', commit_message='Upload dataset', commit_description='', oid='c8d43a76a3a2de864372acbb1ff21f61609b2ce4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/amang1802/personas_sample_405B', endpoint='https://huggingface.co', repo_type='dataset', repo_id='amang1802/personas_sample_405B'), pr_revision=None, pr_num=None)