In [None]:
from datasets import load_dataset, DatasetDict
from sklearn.cluster import KMeans
from vllm import LLM

import json
import numpy as np
import torch

In [3]:
# --- Run configuration -------------------------------------------------------
# Hub id of the dataset to embed, cluster and deduplicate.
ds_id = "amang1802/synthetic_data_unconditioned_L3.1_70B"
# Name of the text column that gets embedded.
column = "synthetic_content"
# Rows per cluster on average: the cluster count is num_rows // dedup_ratio.
dedup_ratio = 2

In [4]:
# Pull only the "train" split of the source dataset from the Hub.
content_ds = load_dataset(ds_id, split="train")

README.md:   0%|          | 0.00/373 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/94.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10244 [00:00<?, ? examples/s]

In [4]:
# Checkpoint id handed to vLLM when the embedding engine is created.
model_id = "Alibaba-NLP/gte-Qwen2-7B-instruct"

In [5]:
# Task instruction that compute_embedding prepends to every text, in the form
# "Instruct: <instruction>\nQuery:\n<text>".
# Earlier alternative instruction, kept for reference:
#instruction = "Identify the name, profession and personality of the person described."
instruction = "Identify the factual information, named entities, concepts and themes from the knowledge content."

In [6]:
# Create the vLLM engine in embedding mode. max_model_len equals the token
# truncation length applied in compute_embedding.
llm = LLM(
    model=model_id,
    task="embed",
    max_model_len=4096,
    hf_overrides={"is_causal": False},
)

INFO 12-28 18:49:32 config.py:280] Overriding HF config with {'is_causal': False}
INFO 12-28 18:49:32 config.py:2272] Downcasting torch.float32 to torch.float16.
INFO 12-28 18:49:38 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='Alibaba-NLP/gte-Qwen2-7B-instruct', speculative_config=None, tokenizer='Alibaba-NLP/gte-Qwen2-7B-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, serv

Loading safetensors checkpoint shards:   0% Completed | 0/7 [00:00<?, ?it/s]


INFO 12-28 18:49:46 model_runner.py:1099] Loading model weights took 13.2529 GB


In [11]:
def compute_embedding(texts, max_tokens=4096):
    """Embed a batch of texts using the instruction-prefixed prompt format.

    Each text is wrapped as ``"Instruct: <instruction>\\nQuery:\\n<text>"``,
    tokenized, truncated on the right to ``max_tokens`` token ids, and sent to
    ``llm.embed``.

    Args:
        texts: Iterable of raw strings (one batch of the text column).
        max_tokens: Maximum number of prompt token ids kept per text. The
            default (4096) equals the ``max_model_len`` the engine is built
            with; keep the two in sync if either changes.

    Returns:
        A dict with the single key ``"embedding"`` whose value is a list with
        one embedding per output returned by ``llm.embed``.
    """
    tokenizer = llm.get_tokenizer()
    # The prefix is identical for every text, so build it once per batch.
    prompt_prefix = "Instruct: " + instruction + "\nQuery:\n"
    token_prompts = [
        {"prompt_token_ids": tokenizer.encode(prompt_prefix + text)[:max_tokens]}
        for text in texts
    ]
    outputs = llm.embed(token_prompts)

    return {"embedding": [output.outputs.embedding for output in outputs]}

In [12]:
# Embed the text column in batches of 1024 rows. The result of map (with the
# "embedding" key returned by compute_embedding) is bound to embed_ds.
embed_ds = content_ds.map(
    compute_embedding,
    batched=True,
    batch_size=1024,
    input_columns=[column],
)

Map:   0%|          | 0/10244 [00:00<?, ? examples/s]


[Acessed prompts:   0% 0/1024 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   0% 1/1024 [00:02<36:50,  2.16s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   2% 22/1024 [00:04<02:42,  6.16it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   4% 45/1024 [00:06<01:52,  8.72it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   6% 66/1024 [00:08<01:42,  9.38it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   9% 89/1024 [00:10<01:31, 10.24it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  12% 118/1024 [00:11<01:16, 11.86it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  14% 146/1024 [00:13<01:08, 12.87it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  15% 158/1024 [00:15<01:21, 10.59it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:  18% 1

In [5]:
# Collect the embeddings into one NumPy array for clustering. Read them from
# embed_ds (the result of the embedding map): content_ds is never assigned the
# map result, so it only has an 'embedding' column if the Hub copy that was
# downloaded already contained one.
embeds = np.array(embed_ds['embedding'])

In [7]:
# Target cluster count: one cluster per `dedup_ratio` rows (integer division).
n_clusters = len(content_ds) // dedup_ratio

In [8]:
# TODO: Move to faiss
# Fix the seed: KMeans initialisation is random, and later cells depend on the
# concrete cluster ids (e.g. inspecting one cluster by its id), so results must
# be reproducible across kernel restarts.
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
clusters = kmeans.fit_predict(embeds)

In [9]:
def _attach_cluster_id(_, idx):
    """Return the cluster label assigned to dataset row ``idx``."""
    return {"cluster_id": clusters[idx]}


# Add a "cluster_id" column row by row, then upload.
# NOTE: the upload targets ds_id, i.e. the same repo the source data was
# loaded from.
clusters_ds = embed_ds.map(_attach_cluster_id, with_indices=True)
clusters_ds.push_to_hub(ds_id)

In [10]:
# Distinct cluster labels that were actually assigned (np.unique returns them sorted).
uniq_clusters = np.unique(clusters)
uniq_clusters

array([   0,    1,    2, ..., 5119, 5120, 5121], dtype=int32)

In [21]:
# TODO: Move to faiss
# For every cluster, pick the member row closest to the cluster mean as its
# representative. The search is restricted to the cluster's own members: the
# globally nearest row to a centroid can belong to a different cluster, which
# would let one row represent two clusters (and leave a cluster unrepresented).
centroid_ids = []
for cluster_id in uniq_clusters:
    member_ids = np.flatnonzero(clusters == cluster_id)  # row indices in this cluster
    cluster_points = embeds[member_ids]
    centroid = cluster_points.mean(axis=0)
    member_dists = np.linalg.norm(cluster_points - centroid, axis=1)
    # Map the position within the cluster back to the dataset row index.
    nearest_index = int(member_ids[np.argmin(member_dists)])
    centroid_ids.append(nearest_index)

In [22]:
# One representative per cluster ...
assert len(centroid_ids) == len(uniq_clusters)
# ... and no row may represent more than one cluster, otherwise the filtered
# "centroid" subset ends up with fewer rows than there are clusters.
assert len(set(centroid_ids)) == len(centroid_ids), "a row was selected as representative of multiple clusters"

In [25]:
# Snapshot of the representative row indices as a set, so each membership test
# is O(1) instead of a linear scan of the list for every dataset row.
# Re-run this cell whenever centroid_ids is recomputed.
_centroid_id_set = frozenset(int(i) for i in centroid_ids)


def is_cluster_centroid(idx):
    """Return True if dataset row ``idx`` was chosen as a cluster representative."""
    return idx in _centroid_id_set

In [54]:
def _keep_representative(_, idx):
    """Filter predicate: keep rows that are cluster representatives."""
    return is_cluster_centroid(idx)


def _keep_non_representative(_, idx):
    """Filter predicate: keep rows that are not cluster representatives."""
    return not is_cluster_centroid(idx)


# Split the dataset into representatives and everything else.
centroid_ds = content_ds.filter(_keep_representative, with_indices=True)
not_centroid_ds = content_ds.filter(_keep_non_representative, with_indices=True)

Filter:   0%|          | 0/10244 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10244 [00:00<?, ? examples/s]

In [55]:
# Row counts of the representative and non-representative subsets.
(len(centroid_ds), len(not_centroid_ds))

(5122, 5122)

In [56]:
# Representatives become the "train" split; every other row goes to "test".
deduped_ds = DatasetDict(
    {
        "train": centroid_ds,
        "test": not_centroid_ds,
    }
)

In [57]:
# Display the resulting splits before uploading.
deduped_ds

DatasetDict({
    train: Dataset({
        features: ['synthetic_content', 'cluster_id'],
        num_rows: 5122
    })
    test: Dataset({
        features: ['synthetic_content', 'cluster_id'],
        num_rows: 5122
    })
})

In [58]:
# Upload both splits to a sibling repo named after the source dataset.
deduped_repo_id = f"{ds_id}_deduped"
deduped_ds.push_to_hub(deduped_repo_id)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/441 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/amang1802/synthetic_data_unconditioned_L3.1_70B_deduped/commit/e08b5da71a41d641b9a5c4a0cec02d1ef90228e1', commit_message='Upload dataset', commit_description='', oid='e08b5da71a41d641b9a5c4a0cec02d1ef90228e1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/amang1802/synthetic_data_unconditioned_L3.1_70B_deduped', endpoint='https://huggingface.co', repo_type='dataset', repo_id='amang1802/synthetic_data_unconditioned_L3.1_70B_deduped'), pr_revision=None, pr_num=None)

In [104]:
# Ids of clusters that contain more than 20 rows.
cluster_sizes = np.bincount(clusters)
np.nonzero(cluster_sizes > 20)

(array([  32,   76, 1207, 3563]),)

In [105]:
# Print the text of every row assigned to cluster 3563.
for row_idx in np.flatnonzero(clusters == 3563):
    print(content_ds[int(row_idx)]['synthetic_content'])

How to use Artificial Intelligence to automate business processes

Introduction: Artificial Intelligence (AI) is becoming increasingly popular in various industries as it helps automate repetitive tasks and improve efficiency. In this article, we will explore how businesses can use AI to automate their processes and increase productivity.

## What is Artificial Intelligence?

Artificial Intelligence refers to the simulation of human intelligence in machines that are programmed to think and learn like humans. It involves the development of algorithms and models that enable computers to perform tasks that would otherwise require human intelligence.

## How can AI help automate business processes?

AI can be used to automate a wide range of business processes, including:

1. Data analysis: AI can analyze large amounts of data quickly and accurately, identifying patterns and insights that humans may miss.

2. Customer service: AI-powered chatbots can provide instant responses to customer q