REF: https://www.youtube.com/watch?v=0zgYu_9WF7A

In [1]:
from langchain_community.document_loaders import RSSFeedLoader
from langchain_community.document_loaders import CSVLoader
from langchain_community.document_loaders import PyPDFLoader
import config
import os

In [2]:
# Check whether a persisted vector DB already exists.
# A missing persist directory is treated the same as an empty one: calling
# os.listdir() on a path that has not been created yet raises
# FileNotFoundError, which would stop the notebook on a fresh checkout.
vectordb_exists = (
    os.path.isdir(config.VECTORSTORE_PERSIST_DIR)
    and len(os.listdir(config.VECTORSTORE_PERSIST_DIR)) != 0
)

In [3]:

# Bind `all_docs` up front so it exists even when the persisted vector DB is
# reused; a later cell reads it unconditionally and would otherwise fail with
# NameError on Restart & Run All.
all_docs = []

if not vectordb_exists:

    # Load CSV docs
    loader = CSVLoader(file_path='rag_documents/enterprise-attack-v16.csv')
    csv_docs = loader.load()

    # Load RSS feeds to docs (report feeds and tool feeds are loaded together)
    all_rss_urls = config.RSS_INTEL_REPORTS_URLS + config.RSS_INTEL_TOOLS_URLS
    loader = RSSFeedLoader(urls=all_rss_urls)
    rss_docs = loader.load()

    all_docs = csv_docs + rss_docs

    # Report counts instead of printing every loaded document, which floods
    # the cell output and bloats the saved notebook.
    print(f"Loaded {len(csv_docs)} CSV docs and {len(rss_docs)} RSS docs ({len(all_docs)} total)")

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter used to cut the loaded documents into overlapping chunks before
# they are embedded.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=100,
    add_start_index=True
)

# Only split when the vector DB has to be built. When a persisted DB is
# reused, no documents are loaded, so skip the split and keep `all_splits`
# defined as an empty list instead of failing on the missing `all_docs`.
all_splits = text_splitter.split_documents(all_docs) if not vectordb_exists else []

In [None]:
# Preview only the first few chunks; displaying the whole list would dump
# every chunk into the cell output.
all_splits[:5]

In [2]:
from langchain_ollama import OllamaEmbeddings

# Embeddings client backed by Ollama; the model name is read from config so
# it can be changed without editing the notebook.
local_embeddings = OllamaEmbeddings(
    model=config.OLLAMA_EMBEDDINGS_MODEL,
)

In [5]:
from langchain_chroma import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata

# Reuse the persisted vector DB when one was found; otherwise build a new
# store from the document chunks and persist it to the same directory.
if vectordb_exists:
    vectorstore = Chroma(
        embedding_function=local_embeddings,
        persist_directory=config.VECTORSTORE_PERSIST_DIR,
    )
else:
    # Chunks are passed through filter_complex_metadata before being stored.
    vectorstore = Chroma.from_documents(
        documents=filter_complex_metadata(all_splits),
        embedding=local_embeddings,
        persist_directory=config.VECTORSTORE_PERSIST_DIR,
    )


In [22]:
# Question that drives both the retrieval step and the final LLM prompt.
question = "List some features of Cyber Threat Intel (CTI) Platforms such as OpenCTI."

# Retriever over the vector store: search type comes from config, and the
# search is asked for k=4 results.
retriever = vectorstore.as_retriever(
    search_type=config.VECTORSTORE_SEARCH_TYPE,
    search_kwargs=dict(k=4),
)
retrieved_docs = retriever.invoke(question)

In [None]:
# Display the documents returned by the retriever for the question above.
retrieved_docs

In [None]:
# Concatenate the text of every retrieved document, separated by single
# spaces, into one context string for the prompt.
context = " ".join(doc.page_content for doc in retrieved_docs)
context

In [25]:
from langchain_ollama.llms import OllamaLLM

# LLM client used to answer the question.
llm = OllamaLLM(model="llama3.2:1b")

# Assemble the prompt from the question and the retrieved context first,
# then send it to the model as a separate step.
prompt = f"""
    Answer the question according to the context:
        Question: {question}
        Context: {context}
"""
response = llm.invoke(prompt)

In [None]:
# Print the model's answer; print() shows the text with its line breaks
# rendered rather than as an escaped string repr.
print(response)