In [17]:
import os
from dotenv import load_dotenv

from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader, TextLoader

from langchain_astradb import AstraDBVectorStore
from langchain_core.documents import Document

load_dotenv()

ASTRA_DB_COLLECTION_NAME = os.getenv("ASTRA_DB_COLLECTION_NAME")
ASTRA_DB_API_ENDPOINT = os.getenv("ASTRA_DB_API_ENDPOINT")
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")

os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")

In [11]:
# Load documents from directory
loader = DirectoryLoader(
    "data", 
    glob="*.txt", 
    loader_cls=TextLoader,
    loader_kwargs={'encoding': 'utf-8'}
)
documents = loader.load()

print(f"Loaded {len(documents)} documents")
print(f"\nFirst document preview:")
print(documents[0].page_content[:200] + "...")

Loaded 3 documents

First document preview:

    Machine Learning Fundamentals

    Machine learning is a subset of artificial intelligence that enables systems to learn 
    and improve from experience without being explicitly programmed. Ther...


In [12]:
embeddings=OpenAIEmbeddings(model="text-embedding-3-small",dimensions=1024)

In [16]:
vector_store=AstraDBVectorStore(
    embedding=embeddings,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    collection_name=ASTRA_DB_COLLECTION_NAME,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace=None,
)
vector_store

<langchain_astradb.vectorstores.AstraDBVectorStore at 0x1fb870350f0>

In [18]:
document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]

In [19]:
vector_store.add_documents(documents=documents)

['2009b23291cf4dc2a176f6c1286ae483',
 'b81d7c1caa3b4e0d90c8ab98807d9adb',
 '840a57f4cee94f118954d1ae35114b7d',
 '9331b0f25fb94f28833a4cb50bd1f978',
 '232d1eef5cca47b794bd9f5b32f1765d',
 '134ff4241de8458eb5d1d3666d6c6444',
 'ed1f0ca24ea449b799c96104b94c3a52',
 '33cec180d59a4b14be507d6db0aac879',
 'bd5a93511fdb43969648a4a77cc70e0e',
 '087502e6ca70489ebaba70fab41f0f23']

In [20]:
### Search from Vector Store DB

vector_store.similarity_search("What is the weather")

[Document(id='b81d7c1caa3b4e0d90c8ab98807d9adb', metadata={'source': 'news'}, page_content='The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.'),
 Document(id='bd5a93511fdb43969648a4a77cc70e0e', metadata={'source': 'news'}, page_content='The stock market is down 500 points today due to fears of a recession.'),
 Document(id='840a57f4cee94f118954d1ae35114b7d', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='087502e6ca70489ebaba70fab41f0f23', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :(')]

In [21]:
results = vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=3,
    filter={"source": "tweet"},
)
for res in results:
    print(f'* "{res.page_content}", metadata={res.metadata}')

* "Building an exciting new project with LangChain - come check it out!", metadata={'source': 'tweet'}
* "LangGraph is the best framework for building stateful, agentic applications!", metadata={'source': 'tweet'}
* "Wow! That was an amazing movie. I can't wait to see it again.", metadata={'source': 'tweet'}


In [22]:
results = vector_store.similarity_search_with_score(
    "LangChain provides abstractions to make working with LLMs easy",
    k=3,
    filter={"source": "tweet"},
)
for res, score in results:
    print(f'* [SIM={score:.2f}] "{res.page_content}", metadata={res.metadata}')

* [SIM=0.72] "Building an exciting new project with LangChain - come check it out!", metadata={'source': 'tweet'}
* [SIM=0.71] "LangGraph is the best framework for building stateful, agentic applications!", metadata={'source': 'tweet'}
* [SIM=0.53] "Wow! That was an amazing movie. I can't wait to see it again.", metadata={'source': 'tweet'}


In [23]:
### Retriever
retriever=vector_store.as_retriever(
  search_type="similarity_score_threshold",
    search_kwargs={"k": 1, "score_threshold": 0.5},
)
retriever.invoke("Stealing from the bank is a crime", filter={"source": "news"})

[Document(id='9331b0f25fb94f28833a4cb50bd1f978', metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.')]

In [24]:
### Retriever
retriever=vector_store.as_retriever(
  search_type="mmr",
    search_kwargs={"k": 1},
)
retriever.invoke("Stealing from the bank is a crime", filter={"source": "news"})

[Document(id='9331b0f25fb94f28833a4cb50bd1f978', metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.')]