# BBC Dataset Ingestion
Use this notebook to load the BBC news dataset and push it into the local Weaviate instance.

In [None]:
from pathlib import Path
import sys

# Resolve the current working directory and register its ``src`` folder on
# the import path (once) so packages located there can be imported below.
PROJECT_ROOT = Path().resolve()
SRC_ROOT = PROJECT_ROOT.joinpath("src")
_src_entry = str(SRC_ROOT)
if _src_entry not in sys.path:
    sys.path.append(_src_entry)


In [None]:
from dataclasses import asdict
from typing import Iterable, Sequence

import weaviate
from datasets import Dataset, load_dataset
from tqdm.auto import tqdm
from weaviate.classes.config import Configure, DataType, Property

from news_chatbot.config import load_weaviate_settings


In [None]:
# Name of the Weaviate collection the articles are written to and queried from.
COLLECTION_NAME: str = "BBCArticle"
# Number of dataset rows grouped together between progress-bar updates in ingest().
BATCH_SIZE: int = 64

def ensure_collection(client: weaviate.WeaviateClient) -> weaviate.collections.Collection:
    """Return the article collection, creating it first when it is missing.

    Args:
        client: Weaviate client used to look up and create the collection.

    Returns:
        The handle obtained from ``client.collections.get(COLLECTION_NAME)``.
    """
    collections = client.collections
    if not collections.exists(COLLECTION_NAME):
        # Every field of an article is stored as a plain text property.
        text_fields = ("news_id", "article", "summary")
        collections.create(
            name=COLLECTION_NAME,
            vectorizer_config=Configure.Vectorizer.text2vec_transformers(),
            properties=[
                Property(name=field_name, data_type=DataType.TEXT)
                for field_name in text_fields
            ],
        )
    return collections.get(COLLECTION_NAME)


In [None]:
def batched(rows: Iterable[dict], size: int) -> Iterable[Sequence[dict]]:
    """Yield rows from an iterable in lists of at most ``size`` items.

    Every yielded list holds exactly ``size`` rows except possibly the last
    one, which carries the remainder. An empty input yields nothing.

    Args:
        rows: Iterable of row dictionaries to group.
        size: Maximum number of rows per yielded list; must be at least 1.

    Yields:
        Lists of consecutive rows, in their original order.

    Raises:
        ValueError: If ``size`` is less than 1. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    # A non-positive size can never match len(batch), which would silently
    # collapse the whole input into one oversized batch.
    if size < 1:
        raise ValueError(f"size must be at least 1, got {size!r}")

    batch: list[dict] = []
    for row in rows:
        batch.append(row)
        if len(batch) >= size:
            yield batch
            batch = []
    # Flush the trailing partial batch, if any.
    if batch:
        yield batch


In [None]:
def ingest() -> None:
    """Load the BBC dataset and write it into Weaviate using dynamic batching.

    Loads the ``train`` split of ``shwet/BBC_NEWS``, connects to the Weaviate
    instance described by ``load_weaviate_settings()``, makes sure the target
    collection exists, and adds every row as an object. Rows are walked in
    groups of ``BATCH_SIZE`` so the progress bar is updated once per group.

    Objects rejected during the batch import are not raised as exceptions by
    the batch context, so a summary of the failures is printed after the
    batch has been flushed.
    """
    dataset: Dataset = load_dataset("shwet/BBC_NEWS", split="train")
    total_rows = len(dataset)
    settings = load_weaviate_settings()

    with weaviate.connect_to_local(
        host=settings.host,
        port=settings.port,
        grpc_port=settings.grpc_port,
        headers=settings.headers,
    ) as client:
        collection = ensure_collection(client)

        with collection.batch.dynamic() as writer, tqdm(
            total=total_rows,
            desc="Ingesting BBC articles",
            unit="rows",
        ) as progress:
            for rows in batched(dataset, BATCH_SIZE):
                for row in rows:
                    writer.add_object(
                        properties={
                            # The dataset id is stored as text, matching the schema.
                            "news_id": str(row["ids"]),
                            "article": row["articles"],
                            "summary": row["summary"],
                        },
                    )
                progress.update(len(rows))

        # Per-object errors are collected by the batcher instead of being
        # raised; read them only after the batch context has exited so the
        # list is complete.
        failed = collection.batch.failed_objects
        if failed:
            print(
                f"{len(failed)} of {total_rows} objects failed to import; "
                f"first error: {failed[0].message}"
            )


In [None]:
# Run the ingestion (uncomment to execute)
# ingest()


In [None]:
# Smoke-test the ingested data: run one semantic search against the
# collection and show the raw response.
settings = load_weaviate_settings()
with weaviate.connect_to_local(
    host=settings.host,
    port=settings.port,
    grpc_port=settings.grpc_port,
    headers=settings.headers,
) as client:
    collection = client.collections.get(COLLECTION_NAME)
    response = collection.query.near_text(
        query="Give me news about London",
        limit=1,
        return_metadata=["distance", "score"],
    )

# The notebook only echoes a value when the expression is the cell's last
# top-level statement; inside the ``with`` block the result was discarded.
asdict(response)
