In [None]:
!pip install -r lancedb-dev/requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Load example embeddings from HF (use local cache)

In [3]:
import os

# Use offline datasets (serve everything from the local Hugging Face cache).
# The flag is set BEFORE importing `datasets` because the library reads its
# offline configuration when it is imported; setting it afterwards leaves the
# already-imported module unaffected.
os.environ["HF_DATASETS_OFFLINE"] = "1"

from datasets import load_dataset  # noqa: E402  (must follow the env setup above)

# Load a sample dataset from HuggingFace with pre-computed embeddings
sample_dataset = load_dataset("sunhaozhepy/ag_news_sbert_keywords_embeddings", split="test[:1000]")
print(f"Loaded {len(sample_dataset)} samples")
print(f"Sample features: {sample_dataset.features}")
print(f"Column names: {sample_dataset.column_names}")

# Preview the first sample. Only the first few embedding values are shown so
# the cell output is not flooded by the full vector.
first_sample = sample_dataset[0]
first_embedding = first_sample["keywords_embeddings"]
preview = {**first_sample, "keywords_embeddings": first_embedding[:5]}
print(f"First sample (embedding truncated to 5 values): {preview}")

# Get embedding dimension (used later to declare the fixed-size vector column)
vector_dim = len(first_embedding)
print(f"Embedding dimension: {vector_dim}")

  from .autonotebook import tqdm as notebook_tqdm


Loaded 1000 samples
Sample features: {'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None), 'keywords': Value(dtype='string', id=None), 'keywords_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)}
Column names: ['text', 'label', 'keywords', 'keywords_embeddings']
{'text': "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.", 'label': 2, 'keywords': 'pension, disappointed, unions', 'keywords_embeddings': [-0.04149172827601433, 0.10335735976696014, 0.02729571796953678, -0.01626148633658886, -0.006845010910183191, 0.030066702514886856, 0.026151811704039574, 0.05935041606426239, 0.05747139826416969, -0.015899423509836197, 0.027972225099802017, 0.089542455971241, -0.007342466153204441, 0.006704241968691349, -0.024866420775651932, 0.004836398642510176, -0.011853741481900215, 0.0324692

## Local: initialize a local database connection

In [None]:
import lancedb

# Location handed to LanceDB: a relative filesystem path for the local database.
uri = "data/sample-lancedb"

# Open the connection object that the following cells use to create tables.
db = lancedb.connect(uri=uri)

## Create a table and ingest data

In [5]:
import pyarrow as pa

table_name = "lancedb-local-quickstart"

# Ingest the loaded samples into a table of that name, requesting "overwrite" mode.
table = db.create_table(table_name, data=sample_dataset, mode="overwrite")

# Re-type the embedding column from a variable-length list to a fixed-size
# list of float32 whose length is the embedding dimension measured earlier.
fixed_size_vector = pa.list_(pa.float32(), vector_dim)
table.alter_columns({"path": "keywords_embeddings", "data_type": fixed_size_vector})

print(f"Table '{table_name}' created successfully")

Table 'lancedb-local-quickstart' created successfully


[90m[[0m2025-05-09T03:49:53Z [33mWARN [0m lance::dataset::write::insert[90m][0m No existing dataset at /home/admin/workspace/localdb/sample-lancedb/lancedb-local-quickstart.lance, it will be created


## Create a vector index

In [6]:
from datetime import timedelta

# Build a vector index over the embedding column using the cosine metric,
# then look up and print the statistics reported for that index.
index_name = "keywords_embeddings_idx"
table.create_index(metric="cosine", vector_column_name="keywords_embeddings")
index_stats = table.index_stats(index_name)
print(index_stats)

IndexStatistics(num_indexed_rows=1000, num_unindexed_rows=0, index_type='IVF_PQ', distance_type='cosine', num_indices=1, loss=643.3737193197012)


## Perform a vector search

In [7]:
import os

# Keep the Hugging Face offline flag set for this cell as well.
os.environ["HF_DATASETS_OFFLINE"] = "1"

# Fetch a single row (index 5000 of the test split, outside the test[:1000]
# slice that was ingested) and use its embedding as the query vector.
query_dataset = load_dataset(
    "sunhaozhepy/ag_news_sbert_keywords_embeddings",
    split="test[5000:5001]",
)
query_row = query_dataset[0]
print(f"Query keywords: {query_row['keywords']}")
query_embed = query_row["keywords_embeddings"]

# Columns projected by both queries, and how many rows each one returns.
result_columns = ("text", "keywords", "label")
top_k = 5

# A vector search
plain_query = table.search(query_embed)
result = plain_query.select(list(result_columns)).limit(top_k).to_pandas()
print("Search results:")
print(result)

# A vector search with a filter
filtered_query = table.search(query_embed).where("label > 2")
filtered_result = filtered_query.select(list(result_columns)).limit(top_k).to_pandas()
print("Filtered search results (label > 2):")
print(filtered_result)

Query keywords: toyota, profit, carmaker
Search results:
                                                text  \
0  The Hunt for a Hybrid The Aug. 23 front-page a...   
1  GM pulls Corvette ad with underage driver DETR...   
2  Toy store profits R back up TOY retailer Toys ...   
3  Clicking on Profits The latest data from the U...   
4  GM pulls Guy Ritchie car ad after protest Prot...   

                   keywords  label  _distance  
0      prius, civic, toyota      2   0.745241  
1  corvette, commercial, gm      2   0.878651  
2        toys, toy, profits      2   0.918450  
3  profits, commerce, sales      2   0.972514  
4         car, corvette, ad      2   0.983316  
Filtered search results (label > 2):
                                                text  \
0  Does Nick Carr matter? Strategybusiness conclu...   
1  European Union Extends Review of Microsoft Dea...   
2  IT seeing steady but slow growth: Forrester pr...   
3  IBM Buys Two Danish Services Firms IBM said Tu...   
4