# Retrieval-Augmented Generation (RAG) for Cyber Threat Intel

In [1]:
from langchain_community.document_loaders import RSSFeedLoader
from langchain_community.document_loaders import CSVLoader
from langchain_community.document_loaders import PyPDFLoader
import config
import os

In [2]:
# Vector store regeneration switch
REGENERATE_VECTOR_STORE = True      # Set to False to load the saved vectordb

# A saved vectordb counts as present only when the persist directory exists
# and contains at least one entry. os.listdir() raises FileNotFoundError on a
# missing directory, so the isdir() check must come first.
vectordb_on_disk = (
    os.path.isdir(config.VECTORSTORE_PERSIST_DIR)
    and len(os.listdir(config.VECTORSTORE_PERSIST_DIR)) != 0
)

# Later cells rebuild the store when `not vectordb_exists`, so the flag must be
# False whenever regeneration is requested OR nothing is saved on disk.
# (Previously the flag was overwritten with REGENERATE_VECTOR_STORE itself,
# which inverted the switch and discarded the disk check.)
# NOTE(review): when regenerating over a non-empty persist directory, the new
# documents are presumably added alongside the old ones - confirm whether the
# directory should be cleared first.
vectordb_exists = vectordb_on_disk and not REGENERATE_VECTOR_STORE

In [3]:

# Source documents are only needed when the vector store has to be (re)built.
if not vectordb_exists:

    # Load CSV docs
    csv_loader = CSVLoader(file_path='rag_documents/enterprise-attack-v16.csv')
    csv_docs = csv_loader.load()

    # Load RSS feeds to docs (intel-report feeds followed by intel-tool feeds)
    all_rss_urls = config.RSS_INTEL_REPORTS_URLS + config.RSS_INTEL_TOOLS_URLS
    rss_loader = RSSFeedLoader(urls=all_rss_urls)
    rss_docs = rss_loader.load()

    # Load PDF docs
    pdf_loader = PyPDFLoader(file_path='rag_documents/ATTACK_Design_and_Philosophy_March_2020.pdf')
    pdf_docs = pdf_loader.load()

    # Single combined corpus consumed by the splitting cell below.
    all_docs = csv_docs + rss_docs + pdf_docs

    # Report counts instead of printing every document: the full dump floods
    # the cell output and bloats the saved notebook.
    print(
        f"Loaded {len(all_docs)} documents "
        f"(csv={len(csv_docs)}, rss={len(rss_docs)}, pdf={len(pdf_docs)})"
    )

Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 51 0 (offset 0)
Ignoring wrong pointing object 98 0 (offset 0)
Ignoring wrong pointing object 101 0 (offset 0)
Ignoring wrong pointing object 103 0 (offset 0)
Ignoring wrong pointing object 117 0 (offset 0)
Ignoring wrong pointing object 119 0 (offset 0)
Ignoring wrong pointing object 121 0 (offset 0)
Ignoring wrong pointing object 130 0 (offset 0)
Ignoring wrong pointing object 142 0 (offset 0)
Ignoring wrong pointing object 173 0 (offset 0)
Ignoring wrong pointing object 225 0 (offset 0)
Ignoring wrong pointing object 243 0 (offset 0)
Ignoring wrong pointing object 275 0 (offset 0)
Ignoring wrong pointing object 312 0 (offset 0)




In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking is only needed when the vector store is being (re)built.
if not vectordb_exists:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=100,
        add_start_index=True
    )
    all_splits = text_splitter.split_documents(all_docs)

    # A bare expression nested inside an `if` is never echoed by the notebook
    # (only a top-level last expression is), so report the result explicitly.
    print(f"Split {len(all_docs)} documents into {len(all_splits)} chunks")


In [5]:
from langchain_ollama import OllamaEmbeddings

# Embedding function shared by the vector store cells below; the model name
# is read from the project config.
local_embeddings = OllamaEmbeddings(
    model=config.OLLAMA_EMBEDDINGS_MODEL,
)

In [7]:
from langchain_chroma import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata

# Open the saved vectordb when one is flagged as available; otherwise build a
# new one from the document splits and persist it to the same directory.
if vectordb_exists:
    vectorstore = Chroma(
        embedding_function=local_embeddings,
        persist_directory=config.VECTORSTORE_PERSIST_DIR,
    )
else:
    # The splits are passed through filter_complex_metadata before storing.
    storable_splits = filter_complex_metadata(all_splits)
    vectorstore = Chroma.from_documents(
        documents=storable_splits,
        embedding=local_embeddings,
        persist_directory=config.VECTORSTORE_PERSIST_DIR,
    )


In [13]:
# Query used both for retrieval here and for the LLM prompt further down.
question = "Summarize recent phishing campaigns."

# Search type comes from config; "k" is presumably the number of documents
# returned per query - confirm against the retriever documentation.
retriever_search_kwargs = {"k": 4}
retriever = vectorstore.as_retriever(
    search_type=config.VECTORSTORE_SEARCH_TYPE,
    search_kwargs=retriever_search_kwargs,
)

retrieved_docs = retriever.invoke(question)

In [None]:
# Display the documents the retriever returned for `question`.
retrieved_docs

In [None]:
# Flatten the retrieved documents into one space-separated context string.
page_texts = [retrieved.page_content for retrieved in retrieved_docs]
context = ' '.join(page_texts)
context

In [16]:
from langchain_ollama.llms import OllamaLLM

# LLM whose model name is read from the project config.
llm = OllamaLLM(model=config.OLLAMA_LLM_MODEL)

# Prompt that embeds the question and the retrieved context.
rag_prompt = f"""
    Answer the question according to the context:
        Question: {question}
        Context: {context}
"""

response = llm.invoke(rag_prompt)

In [None]:
# Show the value returned by llm.invoke for the prompt above.
print(response)