In [47]:
#!pip install lancedb

In [728]:
import lancedb
# Connect to the LanceDB database located at "./lancedb" (relative to the working directory).
db = lancedb.connect("./lancedb")

In [726]:
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry

# Model id handed to the "sentence-transformers" entry of the embedding registry.
model_name = "plaguss/bge-base-argilla-sdk-matryoshka"
# NOTE(review): "mps" presumably targets Apple-silicon GPUs — use another device on other hardware.
device = "mps"

# Embedding function; later cells use it for the `Docs` schema and to generate embeddings.
model = get_registry().get("sentence-transformers").create(name=model_name, device=device)


In [2]:
from typing import Optional, Any
import os
from pathlib import Path
import tarfile
from dataclasses import dataclass

import lancedb
from huggingface_hub.file_download import hf_hub_download
from huggingface_hub import InferenceClient
from transformers import AutoTokenizer
import gradio as gr


def untar_file(source: Path) -> Path:
    """Extract a gzip-compressed tar archive into the directory that contains it.

    Intended for archives produced by `make_tarfile`.

    Args:
        source (Path): Path pointing to a .tar.gz file.

    Returns:
        filename (Path): `source` with its archive suffix removed, i.e. the
            path where the archived directory is expected after extraction.
    """
    source = Path(source)
    # Strip only the trailing suffixes. A blanket `.replace(".tar", "")` would also
    # remove a ".tar" occurring earlier in the name ("my.target.tar.gz" -> "myget").
    base_name = source.stem  # drops the last extension (".gz")
    if base_name.endswith(".tar"):
        base_name = base_name[: -len(".tar")]
    new_filename = source.parent / base_name
    with tarfile.open(source, "r:gz") as f:
        # Where the running Python offers extraction filters, use "data" so that
        # members trying to land outside the destination directory are rejected.
        if hasattr(tarfile, "data_filter"):
            f.extractall(source.parent, filter="data")
        else:
            f.extractall(source.parent)
    return new_filename


@dataclass
class Settings:
    """Configuration values shared by the retrieval code.

    The defaults are evaluated once, when the class body runs, so `TOKEN`
    reflects the environment at definition time.
    """

    # Name of the database folder expected inside LOCAL_DIR.
    LANCEDB: str = "lancedb"
    # Archive file requested from the Hub repository.
    LANCEDB_FILE_TAR: str = "lancedb.tar.gz"
    # `os.getenv` returns None when HF_API_TOKEN is unset, hence Optional.
    TOKEN: Optional[str] = os.getenv("HF_API_TOKEN")
    # Local directory the archive is downloaded into.
    LOCAL_DIR: Path = Path.home() / ".cache/argilla_sdk_docs_db"
    # Hub repository id the archive is downloaded from.
    REPO_ID: str = "plaguss/argilla_sdk_docs_queries"
    # Name of the table inside the database.
    TABLE_NAME: str = "docs"
    # Embedding model id.
    MODEL_NAME: str = "plaguss/bge-base-argilla-sdk-matryoshka"
    # NOTE(review): "mps" presumably targets Apple-silicon GPUs — override on other hardware.
    DEVICE: str = "mps"


# Single settings instance (all defaults) passed to the objects created below.
settings = Settings()

from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

class Database:
    """Read access to the LanceDB table that stores the documentation chunks.

    Creating an instance opens the table. When the database folder is not
    present under `settings.LOCAL_DIR`, the archive is first downloaded from
    the Hub dataset repository and extracted.
    """

    # Annotations are quoted so they are not evaluated when the class body runs.
    def __init__(self, settings: "Settings"):
        self.settings = settings
        self.table = self.get_table_from_db()

    def get_table_from_db(self) -> "lancedb.table.LanceTable":
        """Open the configured table, fetching the database first if it is missing.

        Returns:
            The table named `settings.TABLE_NAME` of the local database.
        """
        lancedb_db_path = self.settings.LOCAL_DIR / self.settings.LANCEDB
        if not lancedb_db_path.exists():
            lancedb_download = Path(
                hf_hub_download(
                    self.settings.REPO_ID,
                    self.settings.LANCEDB_FILE_TAR,
                    repo_type="dataset",
                    token=self.settings.TOKEN,
                    local_dir=self.settings.LOCAL_DIR
                )
            )

            lancedb_db_path = untar_file(lancedb_download)

        db = lancedb.connect(str(lancedb_db_path))
        # Use the configured name rather than a second hard-coded "docs" literal,
        # so that changing `Settings.TABLE_NAME` actually takes effect.
        return db.open_table(self.settings.TABLE_NAME)

    def retrieve_doc_chunks(self, query: str, limit: int = 12, hard_limit: int = 4) -> str:
        """Return the best-matching documentation chunks as one context string.

        Args:
            query: Question to search for.
            limit: Number of rows requested from the table.
            hard_limit: Maximum number of distinct chunks kept in the result.

        Returns:
            The kept chunks, each prefixed with "- " and separated by a blank
            line; an empty string when nothing is retrieved.
        """
        retrieved = (
            self.table
                .search(query)
                .metric("cosine")
                .limit(limit)
                .select(["text"])  # Just grab the chunk to use for context
                .to_list()
        )
        # We have repeated questions (up to 4) for a given chunk, so we may get repeated chunks.
        # Request more than necessary and filter them afterwards; `dict.fromkeys`
        # drops the duplicates while keeping the ranking order.
        unique_chunks = list(dict.fromkeys(item["text"] for item in retrieved))
        return "\n\n".join(f"- {chunk}" for chunk in unique_chunks[:hard_limit])


# Instantiating opens the table right away (see `Database.__init__`).
database = Database(settings=settings)

In [8]:
# Embedding function built from the model id and device held in `settings`.
model = get_registry().get("sentence-transformers").create(name=settings.MODEL_NAME, device=settings.DEVICE)

# Row schema: a query, a text chunk and a vector column.
# NOTE(review): both `query` and `text` are declared as source fields of the same
# embedding function — confirm which column LanceDB actually embeds.
class Docs(LanceModel):
    query: str = model.SourceField()
    text: str = model.SourceField()
    # The vector width is taken from the embedding model.
    vector: Vector(model.ndims()) = model.VectorField()




In [22]:
query = "How can I get the current user?"
# Embed the question explicitly; the input is a one-element list and the next cell reads element 0.
embedded_query = model.generate_embeddings([query])

In [25]:
# Search with the precomputed vector (the raw-string variant is left commented out),
# using the cosine metric and keeping the first 3 hits.
retrieved = (
    database.table
        #.search(query)
        .search(embedded_query[0])
        .metric("cosine")
        .limit(3)
        .select(["text"])  # Just grab the chunk to use for context
        .to_list()
)

In [26]:
retrieved

[{'text': 'python\nuser = client.users("my_username")\n\nThe current user of the rg.Argilla client can be accessed using the me attribute:\n\npython\nclient.me\n\nClass Reference\n\nrg.User\n\n::: argilla_sdk.users.User\n    options:\n        heading_level: 3',
  '_distance': 0.1881886124610901},
 {'text': 'python\nuser = client.users("my_username")\n\nThe current user of the rg.Argilla client can be accessed using the me attribute:\n\npython\nclient.me\n\nClass Reference\n\nrg.User\n\n::: argilla_sdk.users.User\n    options:\n        heading_level: 3',
  '_distance': 0.20238929986953735},
 {'text': 'Retrieve a user\n\nYou can retrieve an existing user from Argilla by accessing the users attribute on the Argilla class and passing the username as an argument.\n\n```python\nimport argilla_sdk as rg\n\nclient = rg.Argilla(api_url="", api_key="")\n\nretrieved_user = client.users("my_username")\n```',
  '_distance': 0.20401990413665771}]

In [7]:
database.table

LanceTable(connection=LanceDBConnection(/Users/agus/.cache/argilla_sdk_docs_db/lancedb), name="docs")

In [729]:
# We can add extra info, like the doc it pertains to

# NOTE(review): both `query` and `text` are declared as source fields of the same
# embedding function — confirm which column LanceDB actually embeds.
class Docs(LanceModel):
    query: str = model.SourceField()
    text: str = model.SourceField()
    # The vector width is taken from the embedding model.
    vector: Vector(model.ndims()) = model.VectorField()

# NOTE(review): no overwrite/exist-ok option is passed, so re-running this cell
# against a database that already holds "docs" presumably fails — confirm.
table_name = "docs"
table = db.create_table(table_name, schema=Docs)

In [730]:
from datasets import load_dataset

In [731]:
# Load the "train" split of the dataset; the ingestion loop reads its "positive" and "anchor" columns.
ds = load_dataset("plaguss/argilla_sdk_docs_queries", split="train")

In [740]:
import pandas as pd
import tqdm


In [742]:
batch_size = 50
# Ceiling division: the final, possibly partial, batch counts as well. Floor division
# makes the progress bar's total one short whenever len(ds) is not a multiple of batch_size.
n_batches = (len(ds) + batch_size - 1) // batch_size
for batch in tqdm.tqdm(ds.iter(batch_size), total=n_batches):
    # Vectors are computed from the "positive" column, which is stored as `query`;
    # the "anchor" column is stored as `text`.
    embeddings = model.generate_embeddings(batch["positive"])
    df = pd.DataFrame.from_dict({"query": batch["positive"], "text": batch["anchor"], "vector": embeddings})
    table.add(df)


20it [00:05,  3.77it/s]                                                                                                                                                                                                                                              


In [845]:
# Questions tried while exploring: each assignment overrides the previous one,
# so only the last question is actually searched.
query = "How can I create a feedback dataset in argilla?"
query = "How can I connect to an argilla server?"
query = "How can I create a dataset?"
query = "How can I get the current user?"

# Ask for 12 hits because the same chunk can come back more than once, then print
# the first `hard_limit` distinct chunks together with their position in the raw results.
response = table.search(query).limit(12).select(["text"]).to_list()
hard_limit = 3
unique_responses = set()
ctr = 0
for i, d in enumerate(response):
    chunk = d["text"]
    if chunk in unique_responses:
        # Already printed: skip the repeated chunk.
        continue
    unique_responses.add(chunk)
    print("-----", i)
    print(chunk)
    ctr += 1
    if ctr == hard_limit:
        break


----- 0
python
user = client.users("my_username")

The current user of the rg.Argilla client can be accessed using the me attribute:

python
client.me

Class Reference

rg.User

::: argilla_sdk.users.User
    options:
        heading_level: 3
----- 2
Retrieve a user

You can retrieve an existing user from Argilla by accessing the users attribute on the Argilla class and passing the username as an argument.

```python
import argilla_sdk as rg

client = rg.Argilla(api_url="", api_key="")

retrieved_user = client.users("my_username")
```
----- 5
!!! info "Main Class"

Get current user

To ensure you're using the correct credentials for managing users, you can get the current user in Argilla using the me attribute of the Argilla class.

```python
import argilla_sdk as rg

client = rg.Argilla(api_url="", api_key="")

current_user = client.me
```


## Database

The database is a folder small enough that we can zip it and move it around, then load it and start making queries.

See [this reference implementation](https://github.com/plaguss/talking-python/blob/main/src/talking_python/release.py) for the code used to compress/decompress the database and pass it around.

In [772]:
import datetime as dt
import json
import os
import tarfile
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.request import urlopen
import tqdm
import inspect

import requests

#RELEASES_URL = r"https://github.com/plaguss/talking-python/releases"
#RELEASES_ENDPOINT = r"https://api.github.com/repos/plaguss/talking-python/releases"


# strftime-style pattern (year, month, day, hour, minute); not referenced by the cells visible here.
DATE_FORMAT = "%Y%m%d-%H%M"

def make_tarfile(source: Path) -> Path:
    """Pack a directory into a gzip-compressed tar archive placed beside it.

    Args:
        source (Path): Directory to archive.

    Returns:
        path (Path): Location of the archive, i.e. `source` with ".tar.gz" appended.

    Raises:
        FileNotFoundError: If `source` is not an existing directory.
    """
    print(f"Creating tar file from path: {source}...")
    directory = Path(source)
    if not directory.is_dir():
        raise FileNotFoundError(directory)
    archive_path = f"{directory}.tar.gz"
    with tarfile.open(archive_path, "w:gz") as archive:
        # Entries are stored under the directory's own name, not its full path.
        archive.add(str(directory), arcname=directory.name)
    print(f"File generated at: {archive_path}")
    return Path(archive_path)

def untar_file(source: Path) -> Path:
    """Extract a gzip-compressed tar archive into the directory that contains it.

    Intended for archives produced by `make_tarfile`.

    Args:
        source (Path): Path pointing to a .tar.gz file.

    Returns:
        filename (Path): `source` with its archive suffix removed, i.e. the
            path where the archived directory is expected after extraction.
    """
    source = Path(source)
    # It assumes the file ends with .tar.gz. Strip only those trailing suffixes:
    # a blanket `.replace(".tar", "")` would also remove a ".tar" occurring
    # earlier in the name ("my.target.tar.gz" -> "myget").
    base_name = source.stem  # drops the last extension (".gz")
    if base_name.endswith(".tar"):
        base_name = base_name[: -len(".tar")]
    new_filename = source.parent / base_name
    with tarfile.open(source, "r:gz") as f:
        # Where the running Python offers extraction filters, use "data" so that
        # members trying to land outside the destination directory are rejected.
        if hasattr(tarfile, "data_filter"):
            f.extractall(source.parent, filter="data")
        else:
            f.extractall(source.parent)
    print(f"File decompressed: {new_filename}")
    return new_filename

In [758]:
# Archive the "lancedb" folder of the current working directory; the result is "<folder>.tar.gz".
lancedb_path = Path.cwd() / "lancedb"
lancedb_tar = make_tarfile(lancedb_path)


Creating tar file from path: /Users/agus/github_repos/argilla-io/distilabel-workbench/projects/argilla-sdk-bot/lancedb...
File generated at: /Users/agus/github_repos/argilla-io/distilabel-workbench/projects/argilla-sdk-bot/lancedb.tar.gz


In [759]:
!pip install huggingface_hub



In [762]:
from huggingface_hub import HfApi
from huggingface_hub.file_download import hf_hub_download
import os

In [770]:
# NOTE(review): these values repeat the defaults of the `Settings` dataclass defined earlier — keep them in sync.
lancedb_file = "lancedb.tar.gz"
token = os.getenv("HF_API_TOKEN")
local_dir = Path.home() / ".cache/argilla_sdk_docs_db"
repo_id = "plaguss/argilla_sdk_docs_queries"
# Download the archive from the dataset repository into `local_dir`;
# the value returned by `hf_hub_download` is wrapped in a Path.
lancedb_download = Path(
    hf_hub_download(
        repo_id,
        lancedb_file,
        repo_type="dataset",
        token=token,
        local_dir=local_dir
    )
)

In [775]:
# Extract next to the downloaded archive; the returned path is what `lancedb.connect` receives below.
lancedb_db_path = untar_file(lancedb_download)

File decompressed: /Users/agus/.cache/argilla_sdk_docs_db/lancedb


In [None]:
import lancedb

# Rebinds `db` and `table` to the extracted copy of the database.
db = lancedb.connect(str(lancedb_db_path))
table_name = "docs"
table = db.open_table(table_name)


In [786]:

# The first value is overwritten before use; only the second question is searched.
query = "How can I create a feedback dataset in argilla?"
query = "how can I delete users?"

# Top 3 hits by cosine metric, parsed into `Docs` instances.
retrieved = table.search(query).metric("cosine").limit(3).to_pydantic(Docs)
for d in retrieved:
    # One call with a newline separator emits the same lines as six separate prints.
    print("-----QUERY", d.query, "======", "DOC\n", "======", d.text, sep="\n")

-----QUERY
Is it possible to remove a user from Argilla by utilizing the delete function on the User class?
DOC

Delete a user

You can delete an existing user from Argilla by calling the delete method on the User class.

```python
import argilla_sdk as rg

client = rg.Argilla(api_url="", api_key="")

user_to_delete = client.users('my_username')

deleted_user = user_to_delete.delete()
```
-----QUERY
How do I go about deleting a user from Argilla using the delete method provided by the User class?
DOC

Delete a user

You can delete an existing user from Argilla by calling the delete method on the User class.

```python
import argilla_sdk as rg

client = rg.Argilla(api_url="", api_key="")

user_to_delete = client.users('my_username')

deleted_user = user_to_delete.delete()
```
-----QUERY
Can I delete a user from Argilla using the delete method on the User class?
DOC

Delete a user

You can delete an existing user from Argilla by calling the delete method on the User class.

```python
imp

In [787]:

query = "how can I delete users?"

# No `.metric(...)` call here, so whatever distance metric the search defaults to is used.
retrieved = (
    table
    .search(query)
    .limit(3)
    .to_pydantic(Docs)
)
# Print only the chunk text of each hit, preceded by a separator line.
for d in retrieved:
    print("======")
    print(d.text)

Delete a user

You can delete an existing user from Argilla by calling the delete method on the User class.

```python
import argilla_sdk as rg

client = rg.Argilla(api_url="", api_key="")

user_to_delete = client.users('my_username')

deleted_user = user_to_delete.delete()
```
Delete a user

You can delete an existing user from Argilla by calling the delete method on the User class.

```python
import argilla_sdk as rg

client = rg.Argilla(api_url="", api_key="")

user_to_delete = client.users('my_username')

deleted_user = user_to_delete.delete()
```
Delete a user

You can delete an existing user from Argilla by calling the delete method on the User class.

```python
import argilla_sdk as rg

client = rg.Argilla(api_url="", api_key="")

user_to_delete = client.users('my_username')

deleted_user = user_to_delete.delete()
```


In [778]:
retrieved

[Docs(query='How do I create a dataset in Argilla?', text='Create a dataset\n\nTo create a dataset, you can define it in the Dataset class and then call the create method that will send the dataset to the server so that it can be visualized in the UI. If the dataset does not appear in the UI, you may need to click the refresh button to update the view. For further configuration of the dataset, you can refer to the settings section.\n\nThe created dataset will be empty, to add the records refer to this how-to guide.\n\n```python\nimport argilla_sdk as rg', vector=FixedSizeList(dim=768)),
 Docs(query='I need help creating a dataset in Argilla', text='For a detail guide of the dataset creation and publication process, see the Dataset how to guide.\n\nRetrieving an existing Dataset\n\nTo retrieve an existing dataset, use client.datasets("my_dataset") instead.\n\npython\ndataset = client.datasets("my_dataset")\n\nClass Reference\n\nrg.Dataset\n\n::: argilla_sdk.datasets.Dataset\n    options

In [791]:
# None when HF_API_TOKEN is unset. In the visible cells `api_key` is only referenced from commented-out code.
api_key = os.getenv("HF_API_TOKEN")

In [794]:
from huggingface_hub import (
    AsyncInferenceClient,
    InferenceClient,
    get_inference_endpoint,
)
from transformers import AutoTokenizer

# The same Hub id is used for the generation model and for the tokenizer.
model_id = tokenizer_id = "meta-llama/Meta-Llama-3-70B-Instruct"
# First client, created without arguments, only serves the two lookups below.
client = InferenceClient()
# NOTE(review): `status` is not read by any cell visible here.
status = client.get_model_status(model_id)
# NOTE(review): `_resolve_url` is a private method (leading underscore) and may change
# between huggingface_hub releases — confirm against the installed version.
base_url = client._resolve_url(
    model=model_id, task="text-generation"
)
# Rebind `client` to an instance that targets the resolved URL and carries the token.
client = InferenceClient(
    model=base_url,
    token=os.getenv("HF_API_TOKEN")
)

# Tokenizer used in a later cell to render the chat template into a prompt string.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
#aclient = AsyncInferenceClient(
#    model=base_url,
#    token=api_key,
#)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [805]:
# Keyword arguments forwarded to `client.text_generation` in a later cell.
# "stream": True is why that cell iterates over the result piece by piece.
client_kwargs = {
    "stream": True,
    "max_new_tokens": 12,
    "do_sample": False,
    "typical_p": None,
    "repetition_penalty": None,
    "temperature": 0.3,
    "top_p": None,
    "top_k": None,
    "stop_sequences": None,
    "seed": None,
}

query = "How do you make cheese?"
# A batch holding a single conversation: an empty system message followed by the user question.
# NOTE(review): the name `input` shadows the Python builtin for the rest of the kernel session.
input = [
    [
        {"role": "system", "content": ""},
        {
            "role": "user",
            "content": query,
        },
    ]
]
# `tokenize=False` asks for text rather than token ids; `[0]` selects the result
# for the only conversation in the batch.
prompt = tokenizer.apply_chat_template(  # type: ignore
    conversation=input,  # type: ignore
    tokenize=False,
    add_generation_prompt=True,
)[0]

In [806]:
prompt

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHow do you make cheese?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

In [807]:
# Accumulate the streamed pieces and print the growing message after each one.
partial_message = ""
for token in client.text_generation(prompt=prompt, **client_kwargs):
    partial_message += token
    print(partial_message)

Making
Making cheese
Making cheese is
Making cheese is a
Making cheese is a fascinating
Making cheese is a fascinating process
Making cheese is a fascinating process that
Making cheese is a fascinating process that involves
Making cheese is a fascinating process that involves transforming
Making cheese is a fascinating process that involves transforming milk
Making cheese is a fascinating process that involves transforming milk into
Making cheese is a fascinating process that involves transforming milk into a


---