In [171]:
import chromadb
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext

In [172]:
# Connect to a Chroma server over HTTP; no arguments are passed, so the
# client's default connection settings are used.
remote_db = chromadb.HttpClient()
# Uncomment the next line to delete the "articles" collection before re-ingesting.
# remote_db.delete_collection("articles")

In [173]:
# Fetch the "articles" collection (creating it if it does not exist yet) and
# wrap it in the LlamaIndex vector-store adapter.
chroma_collection = remote_db.get_or_create_collection("articles")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [150]:
# Storage context that uses the Chroma-backed vector store; it is handed to
# VectorStoreIndex when the index is built further down.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

## Preprocessing

In [33]:
from dotenv import load_dotenv
import os

# Path to your .env file. Export RAG_POC_ENV_FILE to point at your own file;
# the fallback is the original author's local checkout and will not exist on
# other machines.
env_path = os.environ.get(
    "RAG_POC_ENV_FILE",
    '/Users/alexander.girardet/Code/Personal/projects/rag_poc/.env',
)

# Load the environment variables, get API key.
# Kept as the last expression so the cell displays load_dotenv's return value.
load_dotenv(dotenv_path=env_path)

True

In [86]:
from llama_index import SimpleDirectoryReader, VectorStoreIndex, ServiceContext
from llama_index.text_splitter import SentenceSplitter

### Batch process documents

In [37]:
from typing import Dict, Iterable, List, Optional, Union

from llama_index.readers.base import BaseReader
from llama_index.schema import Document

class SimpleMongoReader(BaseReader):
    """Simple mongo reader.

    Concatenates selected fields of each Mongo document into a ``Document``
    used by LlamaIndex.

    Args:
        host (Optional[str]): Mongo host. Only used together with ``port``.
        port (Optional[int]): Mongo port. Only used together with ``host``.
        uri (Optional[str]): Mongo connection URI. Takes precedence over
            ``host``/``port`` when given.

    Raises:
        ImportError: if ``pymongo`` is not installed.
        ValueError: if neither ``uri`` nor both ``host`` and ``port`` are given.
    """

    def __init__(
        self,
        host: Optional[str] = None,
        port: Optional[int] = None,
        uri: Optional[str] = None
    ) -> None:
        """Initialize with parameters."""
        # Imported lazily so the missing-dependency error carries install advice.
        try:
            from pymongo import MongoClient
        except ImportError as err:
            raise ImportError(
                "`pymongo` package not found, please run `pip install pymongo`"
            ) from err

        client: MongoClient
        if uri:
            client = MongoClient(uri)
        elif host and port:
            client = MongoClient(host, port)
        else:
            raise ValueError("Either `host` and `port` or `uri` must be provided.")

        self.client = client

    def _flatten(self, texts: List[Union[str, List[str]]]) -> List[str]:
        """Flatten one level of nesting: list items are spliced in, others kept as-is."""
        result = []
        for text in texts:
            result += text if isinstance(text, list) else [text]
        return result

    def lazy_load_data(
        self,
        db_name: str,
        collection_name: str,
        field_names: Union[str, List[str]] = "text",
        separator: str = "",
        query_dict: Optional[Dict] = None,
        max_docs: int = 0,
        metadata_names: Optional[Union[str, List[str]]] = None,
    ) -> Iterable[Document]:
        """Lazily load documents from a Mongo collection.

        Args:
            db_name (str): name of the database.
            collection_name (str): name of the collection.
            field_names (Union[str, List[str]]): names of the fields to be
                concatenated. A single string is treated as one field name.
                Defaults to "text"
            separator (str): separator to be used between fields.
                Defaults to ""
            query_dict (Optional[Dict]): query to filter documents. Read more
            at [official docs](https://www.mongodb.com/docs/manual/reference/method/db.collection.find/#std-label-method-find-query)
                Defaults to None
            max_docs (int): maximum number of documents to load.
                Defaults to 0 (no limit)
            metadata_names (Optional[Union[str, List[str]]]): names of the
                fields to be added to the metadata attribute of the Document.
                A single string is treated as one field name. Defaults to None

        Yields:
            Document: one document per Mongo item returned by the query.

        Raises:
            ValueError: if a requested text or metadata field is missing from
                a Mongo document (raised when that item is reached).

        """
        # A bare string would otherwise be iterated character by character
        # (the default "text" would look up fields "t", "e", "x", "t").
        if isinstance(field_names, str):
            field_names = [field_names]
        if isinstance(metadata_names, str):
            metadata_names = [metadata_names]

        db = self.client[db_name]
        cursor = db[collection_name].find(filter=query_dict or {}, limit=max_docs)

        for item in cursor:
            try:
                texts = [item[name] for name in field_names]
            except KeyError as err:
                raise ValueError(
                    f"{err.args[0]} field not found in Mongo document."
                ) from err

            # Fields holding a list of strings are spliced into the flat list.
            texts = self._flatten(texts)
            text = separator.join(texts)

            if metadata_names is None:
                yield Document(text=text)
            else:
                try:
                    metadata = {name: item[name] for name in metadata_names}
                except KeyError as err:
                    raise ValueError(
                        f"{err.args[0]} field not found in Mongo document."
                    ) from err
                yield Document(text=text, metadata=metadata)

In [132]:
# Empty filter: every document in the collection matches.
query_dict = {}
# The article body lives in the "Article" field of each Mongo document.
field_names = ["Article"]
reader = SimpleMongoReader(uri="mongodb://localhost:27017/")
# query_dict is now actually forwarded to the reader (it used to be defined
# but never passed), so editing the filter above takes effect.
documents = reader.load_data(
    "unstructured_data_store",
    "articles",
    field_names=field_names,
    query_dict=query_dict,
    metadata_names=["NewsType", "articleId"],
)

In [134]:
# Inspect the first loaded document (text plus the NewsType/articleId metadata).
documents[0]

Document(id_='2ee1dba8-a2b5-4e91-a004-d92162622cae', embedding=None, metadata={'NewsType': 'business', 'articleId': 0}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='48c17c1bbabbd91708caa50fa82d84c066afb0209ce609d6390135fb31857e39', text='KARACHI: The Sindh government has decided to bring down public transport fares by 7 per cent due to massive reduction in petroleum product prices by the federal government, Geo News reported.Sources said reduction in fares will be applicable on public transport, rickshaw, taxi and other means of traveling.Meanwhile, Karachi Transport Ittehad (KTI) has refused to abide by the government decision.KTI President Irshad Bukhari said the commuters are charged the lowest fares in Karachi as compare to other parts of the country, adding that 80pc vehicles run on Compressed Natural Gas (CNG). Bukhari said Karachi transporters will cut fares when decrease in CNG prices will be made.                        \r\n\r\n\r\n\r

All the data has been loaded into memory. In reality we would use a batch process, but for now this will do. For the index we will create a batch process.

In [151]:
from llama_index import Document
from llama_index.embeddings import OpenAIEmbedding
from llama_index.text_splitter import SentenceSplitter
from llama_index.extractors import TitleExtractor
from llama_index.ingestion import IngestionPipeline, IngestionCache

# Step 1: split every document into sentence-based chunks (25-token chunks, no overlap).
splitter = SentenceSplitter(chunk_size=25, chunk_overlap=0)
# Step 2: embed every chunk with OpenAI's embedding model.
# TODO (from the original note): create a local embedding model instead.
embedder = OpenAIEmbedding()

# Assemble the ingestion pipeline; transformations run in list order.
pipeline = IngestionPipeline(transformations=[splitter, embedder])

# Run the pipeline on the first ten documents only.
nodes = pipeline.run(documents=documents[:10])

Metadata length (10) is close to chunk size (25). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
Metadata length (10) is close to chunk size (25). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
Metadata length (10) is close to chunk size (25). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
Metadata length (10) is close to chunk size (25). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
Metadata length (10) is close to chunk size (25). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
Metadata length (10) is close to chunk size (25). Resulting chunks are less than 50 tokens

In [136]:
# Inspect the first node produced by the pipeline; the recorded output shows its embedding populated.
nodes[0]

TextNode(id_='43055804-0e05-4b47-8d20-261d8510e68d', embedding=[0.014831758104264736, 0.0011606569169089198, 0.0019601064268499613, -0.005895726382732391, -0.025952152907848358, 0.004765883553773165, -0.0334981344640255, -0.021117793396115303, -0.012373493053019047, -0.020665856078267097, 0.020296089351177216, 0.0025489942636340857, -0.0027544202748686075, -0.0048651727847754955, -0.009579699486494064, -0.007771950680762529, -0.002040565013885498, -0.01570824161171913, 0.022199703380465508, 0.000669774366542697, -0.008203345350921154, 0.012811735272407532, 0.009744040668010712, 0.003807228757068515, 0.00023367207904811949, 0.02052890509366989, 0.016297129914164543, -0.02666429616510868, 0.0006950246752239764, 0.014003206044435501, 0.015612376853823662, -0.014010054059326649, 0.002240855246782303, -0.02115887962281704, -0.022994017228484154, -0.02359660156071186, -0.0007005882798694074, -0.008929183706641197, 0.028006412088871002, -0.029334833845496178, 0.018693767488002777, -0.00112470

In [152]:
# Build the index from the embedded nodes; the storage context routes them to the Chroma vector store.
index = VectorStoreIndex(nodes, storage_context=storage_context) # Data is loaded into chroma

In [161]:
# NOTE(review): an earlier inline note claimed 246 nodes were loaded, but the
# recorded output below is (1230, 2) — re-run on a fresh collection to confirm.
chroma_collection.count(), len(nodes) # (entries in the Chroma collection, length of `nodes`)

(1230, 2)

# Querying

This is without any postprocessing.

In [174]:
from llama_index import SimpleDirectoryReader, VectorStoreIndex, ServiceContext
from llama_index.text_splitter import SentenceSplitter

In [175]:
# Re-open the "articles" collection and wrap it again, so the querying section
# works from what is stored in Chroma.
chroma_collection = remote_db.get_or_create_collection("articles")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [178]:
# Number of entries currently stored in the collection.
chroma_collection.count()

133

In [179]:
# Rebuild an index directly on top of the existing vector store (no nodes are passed in here).
loaded_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [157]:
# Query through `loaded_index` (built from the Chroma-backed vector store just
# above) rather than the in-memory `index` from the ingestion section, so this
# section does not depend on the ingestion cells having been run.
query_engine = loaded_index.as_query_engine()
response = query_engine.query("What happened in Karachi?")

In [180]:
from llama_index.retrievers import VectorIndexRetriever

# Retriever over the reloaded index; returns the 10 most similar nodes per query.
retriever = VectorIndexRetriever(index=loaded_index, similarity_top_k=10)

In [181]:
# NOTE: this rebinds `nodes`, which earlier held the ingestion pipeline output;
# from here on it holds the retrieval results instead.
nodes = retriever.retrieve("What happened in Karachi?") # A list of nodes are returned. You choose the most relevant. Then it will fetch that document from your document store.

In [186]:
# Inspect the last of the ten retrieved results.
# NOTE(review): the recorded output lists a different SOURCE node id than the
# one in the inline comment — the comment may be stale; confirm after re-running.
nodes[9] # Related to document: 3a7d5ac4-ff1e-4efb-93c3-f8c9054dbf6b 

NodeWithScore(node=TextNode(id_='692f878d-c543-4f9c-a1f3-2a33069e1245', embedding=None, metadata={'NewsType': 'business', 'articleId': 16, 'Date': '1/21/2015'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='f0be5991-020a-4fac-bcb8-584c5d88a184', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'NewsType': 'business', 'articleId': 16, 'Date': '1/21/2015'}, hash='a6ced14551c4ce7a0506b00961dcd21f2fb8971c7761cf1b0c43ca4d17d4f18f'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='2a69587a-2604-44af-b80f-7f2a40a60b95', node_type=<ObjectType.TEXT: '1'>, metadata={'NewsType': 'business', 'articleId': 15, 'Date': '1/20/2015'}, hash='25c8a53745f6250af723a611d47680f8a132ba4896082a0fef2cf99a8bd8a75e'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='765ecb13-5956-4051-8f37-56a40ad75532', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='f6ff745a33731a3172d60c4e7ad0698ee6bc68c493e1c4fd557e0

In [189]:
# Similarity score of the first retrieved result.
nodes[0].score

0.6986510567420834

In [101]:
# Look at the first loaded document again, to compare its text with the retrieved chunks.
documents[0]

Document(id_='3a7d5ac4-ff1e-4efb-93c3-f8c9054dbf6b', embedding=None, metadata={'NewsType': 'business'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='a345f3d5be78e9f7214d83fd9fd807b0bf75106b77fe17ebd0c59867568cf74d', text='KARACHI: The Sindh government has decided to bring down public transport fares by 7 per cent due to massive reduction in petroleum product prices by the federal government, Geo News reported.Sources said reduction in fares will be applicable on public transport, rickshaw, taxi and other means of traveling.Meanwhile, Karachi Transport Ittehad (KTI) has refused to abide by the government decision.KTI President Irshad Bukhari said the commuters are charged the lowest fares in Karachi as compare to other parts of the country, adding that 80pc vehicles run on Compressed Natural Gas (CNG). Bukhari said Karachi transporters will cut fares when decrease in CNG prices will be made.                        \r\n\r\n\r\n\r\n\r\n\r\n\r\n\r

In [107]:
from pymongo import MongoClient

# Connect to the local MongoDB instance (hosted with Docker).
client = MongoClient("mongodb://localhost:27017/")

# Database that holds the raw articles.
db = client.get_database("unstructured_data_store")

# Handle to the "articles" collection inside that database.
collection = db.get_collection("articles")

In [190]:
from bson.objectid import ObjectId

# NOTE(review): `document_id` is not used by the lookup below; the article is
# fetched by its `articleId` field instead. Presumably a leftover — confirm
# before removing.
document_id = "64a7deab-b1ea-469f-ab85-d37796a0cea7"
# Fetch one article whose `articleId` field equals 3.
document = collection.find_one({"articleId": 3})

In [193]:
# Show the full article text stored in the "Article" field of the fetched Mongo document.
document['Article']

'HONG KONG: Asian markets tumbled Tuesday following painful losses in New York and Europe while the euro sat near nine-year lows as political uncertainty in Greece fanned renewed fears it could leave the eurozone.Oil prices, which fell below the psychological $50 a barrel mark in US trade, edged up marginally but remained under pressure owing to a global supply glut, weak demand and a stronger dollar.Tokyo tumbled 2.50 percent, Hong Kong lost 0.89 percent, Sydney eased 1.67 percent, Seoul was 1.30 percent lower while Shanghai reversed earlier losses to gain 0.70 percent.The first full week of the new year got off to a traumatic start for dealers as they bet a January 25 general election in Greece will see a victory for the the left-wing Syriza party.Markets fear the party will roll back austerity measures required under the IMF-EU bailout of the country, which could in turn lead it to exit the eurozone.The year is "barely three trading days old and already the two biggest themes that w

In [None]:
collection.find("id"