In [1]:
from datasets import load_dataset

# Load a small slice of the dataset: the split expression "train[:100]" asks
# for only the first 100 rows of the train split.
ARXIVER_DATASET = "neuralwork/arxiver"
ARXIVER_SPLIT = "train[:100]"
data = load_dataset(ARXIVER_DATASET, split=ARXIVER_SPLIT)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Columns forwarded to Weaviate. These match the property names declared on
# the "AI_Papers" collection, so nothing outside the schema is sent.
PAPER_COLUMNS = [
    "paper_id",
    "title",
    "abstract",
    "authors",
    "published_date",
    "link",
    "markdown",
]

# The only real transformation is renaming `id` -> `paper_id`; every other
# column is passed through untouched. A column rename avoids pushing each row
# through a Python lambda just to copy values that are already there, and
# `select_columns` makes the exported schema explicit.
weaviate_data = (
    data
    .rename_column("id", "paper_id")
    .select_columns(PAPER_COLUMNS)
)

In [3]:
import weaviate
from weaviate.classes.config import Property, DataType, Configure
from weaviate.util import generate_uuid5
from weaviate.collections.classes.config import ConsistencyLevel
from weaviate.classes.query import Rerank

In [4]:
# Open a client connection to a locally running Weaviate instance.
# No arguments are passed, so the client library's defaults for host and
# ports apply (presumably localhost - confirm against your deployment).
# The connection is closed by the `client.close()` call at the end.
client = weaviate.connect_to_local()

In [5]:
# Reset state so the create cell can be re-run on a fresh kernel.
# Only this notebook's collection is dropped: `delete_all()` would also wipe
# every unrelated collection that happens to live in the same Weaviate
# instance. NOTE(review): deleting a collection that does not exist is
# expected to be a no-op - confirm for the client version in use.
client.collections.delete("AI_Papers")

In [6]:
# --- Collection schema -------------------------------------------------------
# One property per dataset column; everything is text except the publication
# date, which is stored as a DATE.
paper_properties = [
    Property(name=prop_name, data_type=prop_type)
    for prop_name, prop_type in (
        ("paper_id", DataType.TEXT),
        ("title", DataType.TEXT),
        ("abstract", DataType.TEXT),
        ("authors", DataType.TEXT),
        ("published_date", DataType.DATE),
        ("link", DataType.TEXT),
        ("markdown", DataType.TEXT),
    )
]

# --- Named vectors -----------------------------------------------------------
# Both vectors embed only the `abstract` property and leave the collection
# name out of the vectorized text.

# "mxbai_abstract": embeddings come from the explicitly configured inference
# service; stored in an HNSW index with product quantization (default PQ
# settings).
mxbai_abstract_vector = Configure.NamedVectors.text2vec_transformers(
    name="mxbai_abstract",
    source_properties=["abstract"],
    inference_url="http://t2v-transformers-mixedbread-ai-mxbai-embed:8080",
    vectorize_collection_name=False,
    vector_index_config=Configure.VectorIndex.hnsw(
        quantizer=Configure.VectorIndex.Quantizer.pq(),
    ),
)

# "m3_abstract": no inference_url is given here, so this presumably relies on
# whatever endpoint the server's transformers module is configured with
# (TODO confirm it points at the intended model). Stored in a flat index.
m3_abstract_vector = Configure.NamedVectors.text2vec_transformers(
    name="m3_abstract",
    source_properties=["abstract"],
    vectorize_collection_name=False,
    vector_index_config=Configure.VectorIndex.flat(),
)

# --- Create the collection ---------------------------------------------------
# Besides the schema and vectors, the collection is wired to an Ollama
# generative endpoint and a transformers-based reranker.
ai_papers = client.collections.create(
    name="AI_Papers",
    properties=paper_properties,
    vectorizer_config=[mxbai_abstract_vector, m3_abstract_vector],
    generative_config=Configure.Generative.ollama(
        api_endpoint="http://generative-ollama:11434",
        model="aya-expanse:8b",
    ),
    reranker_config=Configure.Reranker.transformers(),
)

In [7]:
# Look up a handle to the "AI_Papers" collection by name; this handle is what
# the batch import below writes through.
collection = client.collections.get("AI_Papers")

In [8]:
# Import every dataset row into the collection using a dynamically sized batch.
with collection.batch.dynamic() as batch:
    for row in weaviate_data:
        # The UUID is derived from `paper_id` via generate_uuid5, so the same
        # paper id always maps to the same object UUID.
        batch.add_object(properties=row, uuid=generate_uuid5(row["paper_id"]))
# Leaving the `with` block sends any remaining queued objects, so no explicit
# flush is needed.

# Batch imports do not raise on per-object errors; they are collected on the
# collection's batch handle. Surface them here so a partial import is not
# mistaken for a successful one.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"{len(failed_objects)} of {len(weaviate_data)} objects failed to import")
    for failure in failed_objects[:5]:
        print(f"  {failure.message}")

In [9]:
# Close the Weaviate client connection opened earlier in the notebook.
# NOTE(review): if an earlier cell raises during "Run All", this cell is never
# reached and the connection stays open until the kernel is restarted.
client.close()