In [1]:
DATA_PATH = "../data/"

In [1]:
!pip install fastembed hf_xet -q


[notice] A new release of pip available: 22.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd

df = pd.read_parquet(DATA_PATH+"articles_chunks.parquet")
df.head()

Unnamed: 0,id,title,categories,abstract_chunk,id_chunk,authors,url,published,year
0,0,Structure-Attribute Transformations with Marko...,cs.LG,title : structure - attribute transformations ...,0,"Zhen Liu, Yongtao Zhang, Shaobo Ren, Yuxin You",http://arxiv.org/abs/2509.21059v1,2025-09-25 12:09:53+00:00,2025
1,1,MAIFormer: Multi-Agent Inverted Transformer fo...,cs.LG,title : maiformer : multi - agent inverted tra...,0,"Seokbin Yoon, Keumjin Lee",http://arxiv.org/abs/2509.21004v1,2025-09-25 10:59:29+00:00,2025
2,2,Why Attention Fails: The Degeneration of Trans...,cs.LG,title : why attention fails : the degeneration...,0,"Zida Liang, Jiayi Zhu, Weiqiang Sun",http://arxiv.org/abs/2509.20942v1,2025-09-25 09:25:51+00:00,2025
3,3,FHRFormer: A Self-supervised Transformer Appro...,"cs.LG, cs.AI, cs.CE, cs.CV",title : fhrformer : a self - supervised transf...,0,"Kjersti Engan, Neel Kanwal, Anita Yeconia, Lad...",http://arxiv.org/abs/2509.20852v1,2025-09-25 07:40:21+00:00,2025
4,4,T2I-Diff: fMRI Signal Generation via Time-Freq...,cs.LG,title : t2i - diff : fmri signal generation vi...,0,"Hwa Hui Tew, Junn Yong Loo, Yee-Fan Tan, Xinyu...",http://arxiv.org/abs/2509.20822v1,2025-09-25 07:08:19+00:00,2025


In [None]:
df_ = df.copy()
df = df_[df_["url"].str.contains("arxiv")]

len(df)

23005

In [5]:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 -q

In [6]:
import torch

print(torch.cuda.is_available())

True


In [None]:
from fastembed import TextEmbedding

model_name = 'BAAI/bge-small-en' #'jinaai/jina-embeddings-v2-base-en'

model = TextEmbedding(model_name=model_name, provider="torch", device="cuda")
embeddings_query = list(model.embed([df.iloc[0].abstract_chunk]))

In [20]:
embeddings_query[0].shape

(384,)

In [21]:
from tqdm import tqdm

embeds = []

for _, art in tqdm(df.iterrows(), total=len(df), desc="Processing"):
    vector = list(model.embed(art['abstract_chunk']))[0]
    if len(vector) > 0:
        embeds.append(vector)

Processing: 100%|██████████| 23005/23005 [19:05<00:00, 20.08it/s]


In [None]:
len(embeds), len(df)

(23005, 79337)

In [None]:
df["embed"] = embeds
df.to_parquet(DATA_PATH+"articles_w_embeds.parquet", engine="pyarrow", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["embed"] = embeds


In [6]:
import pandas as pd

df = pd.read_parquet(DATA_PATH+"articles_w_embeds.parquet")
df = df.drop(df[df.title.isna()].index, axis=0)

In [None]:
from qdrant_client import QdrantClient, models
from tqdm import tqdm

def sanitize_payload(art):
    return {
        "id": int(art.id),
        "title": str(art.title or ""),
        "abstract_chunk": str(art.abstract_chunk or ""),
        "categories": str(art.categories or ""),
        "authors": str(art.authors or ""),
        "published": str(art.published or ""),
        "year": str(art.year or ""),
        "url": str(art.url or "")
    }

def init_collection(collection_name="articles-rag", embed_dim=384, metric="cosine"):
    distances = {"cosine": models.Distance.COSINE,
                 "dot": models.Distance.DOT,
                 "euclidean": models.Distance.EUCLID,
                 "manhattan": models.Distance.MANHATTAN}

    try:
        client.delete_collection(collection_name=collection_name)
    except Exception as e:
        print(f"Error: {e}")

    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=embed_dim,
            distance=distances[metric],
            on_disk=False
        )
    )
    return client

def create_vdb(client, data, collection_name="articles-rag", embed_dim=384, metric="cosine", batch_size=1000):
    client = init_collection(collection_name=collection_name, embed_dim=embed_dim, metric=metric)
    
    id_ = 0
    points = []

    for _, art in tqdm(data.iterrows(), total=len(data), desc="Processing"):
        vector = art["embed"]
        
        if vector is None or len(vector) != embed_dim:
            print(f"Skipping id {art.id} — invalid embedding: {vector}")
            continue

        point = models.PointStruct(
            id=id_,
            vector=vector,
            payload=sanitize_payload(art)
        )

        points.append(point)
        id_ += 1
        
        if len(points) == batch_size:
            client.upsert(
                collection_name=collection_name,
                points=points,
                wait=True
            )
            points = []

    if points:
        client.upsert(
            collection_name=collection_name,
            points=points,
            wait=True
        )
    return points

EMBEDDING_DIMENSIONALITY = 384
BATCH_SIZE = 1000
client = QdrantClient("http://localhost:6333", timeout=60.0)

#docker run -p 6333:6333 -p 6334:6334 -v ~/qdrant_storage:/qdrant/storage qdrant/qdrant

In [12]:
points = create_vdb(client=client, data=df, collection_name="articles-rag-cos", embed_dim=EMBEDDING_DIMENSIONALITY, metric="cosine", batch_size=BATCH_SIZE)

Processing: 100%|██████████| 23005/23005 [17:06<00:00, 22.42it/s] 


In [6]:
points = create_vdb(client=client, data=df, collection_name="articles-rag-euc", embed_dim=EMBEDDING_DIMENSIONALITY, metric="euclidean", batch_size=BATCH_SIZE)

Processing: 100%|██████████| 23005/23005 [00:29<00:00, 770.72it/s] 


In [7]:
points = create_vdb(client=client, data=df, collection_name="articles-rag-dot", embed_dim=EMBEDDING_DIMENSIONALITY, metric="dot", batch_size=BATCH_SIZE)

Processing: 100%|██████████| 23005/23005 [00:33<00:00, 680.13it/s] 


In [8]:
points = create_vdb(client=client, data=df, collection_name="articles-rag-manh", embed_dim=EMBEDDING_DIMENSIONALITY, metric="manhattan", batch_size=BATCH_SIZE)

Processing: 100%|██████████| 23005/23005 [00:35<00:00, 650.88it/s] 


In [1]:
!jupyter nbconvert --to script vector_databases.ipynb

[NbConvertApp] Converting notebook vector_databases.ipynb to script
[NbConvertApp] Writing 4859 bytes to vector_databases.py
