In [1]:
!pip install llm

Collecting llm
  Downloading llm-0.19.1-py3-none-any.whl.metadata (6.5 kB)
Collecting click-default-group>=1.2.3 (from llm)
  Downloading click_default_group-1.2.4-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting sqlite-utils>=3.37 (from llm)
  Downloading sqlite_utils-3.38-py3-none-any.whl.metadata (7.5 kB)
Collecting sqlite-migrate>=0.1a2 (from llm)
  Downloading sqlite_migrate-0.1b0-py3-none-any.whl.metadata (5.4 kB)
Collecting python-ulid (from llm)
  Downloading python_ulid-3.0.0-py3-none-any.whl.metadata (5.8 kB)
Collecting puremagic (from llm)
  Downloading puremagic-1.28-py3-none-any.whl.metadata (5.8 kB)
Collecting sqlite-fts4 (from sqlite-utils>=3.37->llm)
  Downloading sqlite_fts4-1.0.3-py3-none-any.whl.metadata (6.6 kB)
Downloading llm-0.19.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading click_default_group-1.2.4-py2.py3-none-any.whl (4.1 kB)
Downloading sqlit

In [2]:
import click
import json
import llm
import numpy as np
import sklearn.cluster
import sqlite_utils
import textwrap

# System prompt used when titling a cluster, unless --prompt supplies another one.
DEFAULT_SUMMARY_PROMPT = "Short, concise title for this cluster of related documents."


@llm.hookimpl
def register_commands(cli):
    """Register the ``cluster`` command on the supplied ``cli`` command group."""

    @cli.command()
    @click.argument("collection")
    @click.argument("n", type=int)
    @click.option(
        "--truncate",
        type=int,
        default=100,
        help="Truncate content to this many characters - 0 for no truncation",
    )
    @click.option(
        "-d",
        "--database",
        type=click.Path(
            file_okay=True, allow_dash=False, dir_okay=False, writable=True
        ),
        envvar="LLM_EMBEDDINGS_DB",
        help="SQLite database file containing embeddings",
    )
    @click.option(
        "--summary", is_flag=True, help="Generate summary title for each cluster"
    )
    @click.option("-m", "--model", help="LLM model to use for the summary")
    @click.option("--prompt", help="Custom prompt to use for the summary")
    def cluster(collection, n, truncate, database, summary, model, prompt):
        """
        Generate clusters from embeddings in a collection

        Example usage, to create 10 clusters:

        \b
            llm cluster my_collection 10

        Outputs a JSON array of {"id": "cluster_id", "items": [list of items]}

        Pass --summary to generate a summary for each cluster, using the default
        language model or the model you specify with --model.
        """
        from llm.cli import get_default_model, get_key

        # Fall back to embeddings.db in the llm user directory when no -d is given.
        if database:
            db = sqlite_utils.Database(database)
        else:
            db = sqlite_utils.Database(llm.user_dir() / "embeddings.db")

        # Each row becomes (id, decoded embedding vector, content).
        rows = [
            (row[0], llm.decode(row[1]), row[2])
            for row in db.execute(
                """
            select id, embedding, content from embeddings
            where collection_id = (
                select id from collections where name = ?
            )
        """,
                [collection],
            ).fetchall()
        ]

        # Fail with a readable CLI error instead of a numpy/sklearn traceback.
        # An unknown collection name also lands here: the subquery yields NULL,
        # so no embedding row matches.
        if not rows:
            raise click.ClickException(
                "No embeddings found for collection '{}'".format(collection)
            )
        # k-means cannot produce more clusters than there are samples.
        if not 1 <= n <= len(rows):
            raise click.ClickException(
                "n must be between 1 and {} (the number of embeddings in "
                "collection '{}'), got {}".format(len(rows), collection, n)
            )

        clustering_model = sklearn.cluster.MiniBatchKMeans(n_clusters=n, n_init="auto")
        to_cluster = np.array([embedding for _, embedding, _ in rows])
        clustering_model.fit(to_cluster)
        assignments = clustering_model.labels_

        def truncate_text(text):
            """Return ``text`` cut to ``truncate`` characters; None for empty text."""
            if not text:
                return None
            if truncate > 0:
                return text[:truncate]
            return text

        # Group items by assigned cluster label; labels_ is in the same order as rows.
        clusters = {}
        for (item_id, _, content), label in zip(rows, assignments):
            clusters.setdefault(str(label), []).append(
                {"id": str(item_id), "content": truncate_text(content)}
            )
        # Re-arrange into a list
        output_clusters = [{"id": k, "items": v} for k, v in clusters.items()]

        if not summary:
            click.echo(json.dumps(output_clusters, indent=4))
            return

        # Summaries requested: resolve the model and, if it needs one, its API key.
        summary_model = llm.get_model(model or get_default_model())
        if summary_model.needs_key:
            summary_model.key = get_key(
                "", summary_model.needs_key, summary_model.key_env_var
            )
        system_prompt = prompt or DEFAULT_SUMMARY_PROMPT

        # The JSON array is written piece by piece so each cluster is printed
        # as soon as its summary has been generated.
        click.echo("[")
        last_index = len(output_clusters) - 1
        for index, cluster_info in enumerate(output_clusters):
            click.echo("  {")
            click.echo('    "id": {},'.format(json.dumps(cluster_info["id"])))
            click.echo(
                '    "items": '
                + textwrap.indent(
                    json.dumps(cluster_info["items"], indent=2), "    "
                ).lstrip()
                + ","
            )
            prompt_content = "\n".join(
                [item["content"] for item in cluster_info["items"] if item["content"]]
            )
            # Skip the model call when the cluster has no text to summarise.
            if prompt_content.strip():
                summary_text = summary_model.prompt(
                    prompt_content,
                    system=system_prompt,
                ).text()
            else:
                summary_text = None
            click.echo('    "summary": {}'.format(json.dumps(summary_text)))
            # No trailing comma after the final element, to keep the JSON valid.
            click.echo("  }" + ("," if index != last_index else ""))
        click.echo("]")

In [4]:
import numpy as np
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import csv

# Load the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sample dataset of documents, defined inline (no CSV file is read here)
documents = [
    "Machine learning is a subset of artificial intelligence.",
    "Natural language processing deals with the interaction between computers and human language.",
    "Deep learning uses neural networks with multiple layers.",
    "Reinforcement learning is learning what to do to maximize a reward.",
    "Computer vision is the field of AI that trains computers to interpret visual information.",
    "Clustering is an unsupervised learning technique.",
    "Classification is a supervised learning task.",
    "Regression predicts continuous values.",
    "Neural networks are inspired by the human brain.",
    "Support vector machines are used for classification and regression tasks."
]

# Generate embeddings
embeddings = model.encode(documents)

# Perform K-means clustering
num_clusters = 3  # Adjust as needed
# n_init is passed explicitly so the number of initialisations does not depend
# on the default of whichever scikit-learn version is installed.
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init="auto")
cluster_labels = kmeans.fit_predict(embeddings)

# Print the clustering results
preview_chars = 100  # Maximum number of characters of each document to show
for i, (doc, label) in enumerate(zip(documents, cluster_labels)):
    # Only add an ellipsis when the document was actually cut short
    preview = doc if len(doc) <= preview_chars else doc[:preview_chars] + "..."
    print(f"Document {i}: Cluster {label}")
    print(f"Content: {preview}")
    print()

# Optional: Generate summaries for each cluster
# This part depends on how you want to summarize the clusters
# You might need to implement a custom summarization method

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Document 0: Cluster 2
Content: Machine learning is a subset of artificial intelligence....

Document 1: Cluster 1
Content: Natural language processing deals with the interaction between computers and human language....

Document 2: Cluster 2
Content: Deep learning uses neural networks with multiple layers....

Document 3: Cluster 0
Content: Reinforcement learning is learning what to do to maximize a reward....

Document 4: Cluster 2
Content: Computer vision is the field of AI that trains computers to interpret visual information....

Document 5: Cluster 1
Content: Clustering is an unsupervised learning technique....

Document 6: Cluster 1
Content: Classification is a supervised learning task....

Document 7: Cluster 1
Content: Regression predicts continuous values....

Document 8: Cluster 2
Content: Neural networks are inspired by the human brain....

Document 9: Cluster 1
Content: Support vector machines are used for classification and regression tasks....



In [5]:
from sentence_transformers import SentenceTransformer
# Name of the sentence-transformers checkpoint used to embed the documents.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [6]:
# List every sample document next to its position in `documents`.
for position in range(len(documents)):
    print(f"Document {position}: {documents[position]}")

Document 0: Machine learning is a subset of artificial intelligence.
Document 1: Natural language processing deals with the interaction between computers and human language.
Document 2: Deep learning uses neural networks with multiple layers.
Document 3: Reinforcement learning is learning what to do to maximize a reward.
Document 4: Computer vision is the field of AI that trains computers to interpret visual information.
Document 5: Clustering is an unsupervised learning technique.
Document 6: Classification is a supervised learning task.
Document 7: Regression predicts continuous values.
Document 8: Neural networks are inspired by the human brain.
Document 9: Support vector machines are used for classification and regression tasks.


In [8]:
# Embed all documents in one call and report the dimensions of the result.
embeddings = model.encode(documents)
print("Shape of embeddings:", embeddings.shape)

Shape of embeddings: (10, 384)


In [9]:
# Embed the documents one at a time so a single failure does not abort the run.
embeddings = []
# Positions in `documents` that were embedded successfully: row k of
# `embeddings_array` belongs to documents[embedded_indices[k]]. Without this,
# a skipped document would silently shift every later row.
embedded_indices = []
for i, doc in enumerate(documents):
    try:
        embedding = model.encode(doc)
    except Exception as e:
        # Best-effort: report the failure and carry on with the next document.
        print(f"Error embedding document {i}: {e}")
    else:
        embeddings.append(embedding)
        embedded_indices.append(i)

embeddings_array = np.array(embeddings)

In [10]:
# Encode the documents in slices of `batch_size` and collect one vector per document.
batch_size = 10
all_embeddings = [
    vector
    for start in range(0, len(documents), batch_size)
    for vector in model.encode(documents[start:start + batch_size])
]

# Stack the per-document vectors into a single array.
embeddings_array = np.array(all_embeddings)

In [11]:
import numpy as np
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import csv

# Load the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sample dataset of documents, defined inline (no CSV file is read here)
documents = [
    "Machine learning is a subset of artificial intelligence.",
    "Natural language processing deals with the interaction between computers and human language.",
    "Deep learning uses neural networks with multiple layers.",
    "Reinforcement learning is learning what to do to maximize a reward.",
    "Computer vision is the field of AI that trains computers to interpret visual information.",
    "Clustering is an unsupervised learning technique.",
    "Classification is a supervised learning task.",
    "Regression predicts continuous values.",
    "Neural networks are inspired by the human brain.",
    "Support vector machines are used for classification and regression tasks."
]

# Generate embeddings
embeddings = model.encode(documents)

# Perform K-means clustering
num_clusters = 3  # Adjust as needed
# n_init is passed explicitly so the number of initialisations does not depend
# on the default of whichever scikit-learn version is installed.
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init="auto")
cluster_labels = kmeans.fit_predict(embeddings)

# Print the clustering results
preview_chars = 100  # Maximum number of characters of each document to show
for i, (doc, label) in enumerate(zip(documents, cluster_labels)):
    # Only add an ellipsis when the document was actually cut short
    preview = doc if len(doc) <= preview_chars else doc[:preview_chars] + "..."
    print(f"Document {i}: Cluster {label}")
    print(f"Content: {preview}")
    print()

# Optional: Generate summaries for each cluster
# This part depends on how you want to summarize the clusters
# You might need to implement a custom summarization method

Document 0: Cluster 2
Content: Machine learning is a subset of artificial intelligence....

Document 1: Cluster 1
Content: Natural language processing deals with the interaction between computers and human language....

Document 2: Cluster 2
Content: Deep learning uses neural networks with multiple layers....

Document 3: Cluster 0
Content: Reinforcement learning is learning what to do to maximize a reward....

Document 4: Cluster 2
Content: Computer vision is the field of AI that trains computers to interpret visual information....

Document 5: Cluster 1
Content: Clustering is an unsupervised learning technique....

Document 6: Cluster 1
Content: Classification is a supervised learning task....

Document 7: Cluster 1
Content: Regression predicts continuous values....

Document 8: Cluster 2
Content: Neural networks are inspired by the human brain....

Document 9: Cluster 1
Content: Support vector machines are used for classification and regression tasks....

