In [3]:
import os
import random
import subprocess
from typing import List, Dict, Any

import numpy as np
import pandas as pd

# Hugging Face
from datasets import load_dataset

# Sentence Transformers for Embeddings
from sentence_transformers import SentenceTransformer

import evaluate

# Chroma
import chromadb
from chromadb.config import Settings

# BEIR
from beir.retrieval.evaluation import EvaluateRetrieval

import ollama


In [3]:
# Create (or load) a Chroma database
client = chromadb.Client(
    Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory="chroma_db"  # Directory to store Chroma data
    )
)

collection_name = "hotpotqa_docs"
# Clean up if collection exists
if collection_name in [c.name for c in client.list_collections()]:
    client.delete_collection(name=collection_name)

hotpot_collection = client.create_collection(name=collection_name)

# Load a Sentence Transformer for embeddings
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
# Load dataset
dataset = load_dataset("hotpot_qa", "fullwiki")
train_data = dataset["train"]

# For demonstration, limit the data
MAX_SAMPLES = 300
train_data = train_data.select(range(min(MAX_SAMPLES, len(train_data))))
print(f"Using {len(train_data)} samples from HotpotQA fullwiki.")


README.md:   0%|          | 0.00/9.19k [00:00<?, ?B/s]

hotpot_qa.py:   0%|          | 0.00/6.42k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/566M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/47.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/90447 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7405 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7405 [00:00<?, ? examples/s]

Using 300 samples from HotpotQA fullwiki.


In [5]:
def chunk_text(text: str, chunk_size: int = 200) -> List[str]:
    """
    Simple function to chunk text into segments of ~chunk_size words.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i+chunk_size])
        chunks.append(chunk)
    return chunks


In [6]:
doc_id_counter = 0
all_texts = []
all_ids = []

for example in train_data:
    context_list = example["context"]  # List of [title, [sentences]]
    for (title, sentences) in context_list:
        for sentence in sentences:
            # Optionally chunk each sentence
            splitted = chunk_text(sentence, chunk_size=50)
            for chunk in splitted:
                doc_id_counter += 1
                doc_id = f"doc_{doc_id_counter}"
                all_texts.append(chunk)
                all_ids.append(doc_id)

# Compute embeddings
BATCH_SIZE = 256
all_embeddings = []
for i in range(0, len(all_texts), BATCH_SIZE):
    emb_batch = embedding_model.encode(all_texts[i:i+BATCH_SIZE], show_progress_bar=False).tolist()
    all_embeddings.extend(emb_batch)

# Insert into Chroma
hotpot_collection.add(
    documents=all_texts,
    embeddings=all_embeddings,
    ids=all_ids
)

print(f"Inserted {len(all_texts)} total chunks into Chroma.")


ValueError: too many values to unpack (expected 2)

In [7]:
context_list

{'title': ['Radio City (Indian radio station)',
  'History of Albanian football',
  'Echosmith',
  "Women's colleges in the Southern United States",
  'First Arthur County Courthouse and Jail',
  "Arthur's Magazine",
  '2014–15 Ukrainian Hockey Championship',
  'First for Women',
  'Freeway Complex Fire',
  'William Rast'],
 'sentences': [["Radio City is India's first private FM radio station and was started on 3 July 2001.",
   ' It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003).',
   ' It plays Hindi, English and regional songs.',
   ' It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007.',
   ' Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.',
   ' The Radio station c

In [4]:
r = ollama.embed(
  model='bge-m3',
  input='Llamas are members of the camelid family',
)

In [9]:
r['embeddings'][0]

AttributeError: 'list' object has no attribute 'shape'