# Retrieval-Augmented Generation (RAG) for Cyber Threat Intel

In [1]:
from langchain_community.document_loaders import RSSFeedLoader
from langchain_community.document_loaders import CSVLoader
from langchain_community.document_loaders import PyPDFLoader
import config
import os

In [27]:
# Regenerate vector store
REGENERATE_VECTOR_STORE = False      # Set to False to load the saved vectordb, True to force a rebuild

# A saved vectordb is only usable when the persist directory exists and is non-empty.
# os.path.isdir guards os.listdir, which raises if the directory has not been created yet.
persist_dir = config.VECTORSTORE_PERSIST_DIR
vectordb_on_disk = os.path.isdir(persist_dir) and len(os.listdir(persist_dir)) != 0

# Downstream cells rebuild the store whenever this is False, so it must be False
# both when a rebuild is requested and when there is nothing on disk to load.
vectordb_exists = vectordb_on_disk and not REGENERATE_VECTOR_STORE

In [29]:
# Source documents are only needed when the vectorstore is going to be rebuilt.
if not vectordb_exists:

    # CSV source
    csv_docs = CSVLoader(file_path='rag_documents/enterprise-attack-v16.csv').load()

    # RSS sources: intel-report feeds followed by intel-tool feeds
    all_rss_urls = config.RSS_INTEL_REPORTS_URLS + config.RSS_INTEL_TOOLS_URLS
    rss_docs = RSSFeedLoader(urls=all_rss_urls).load()

    # PDF source
    pdf_docs = PyPDFLoader(
        file_path='rag_documents/ATTACK_Design_and_Philosophy_March_2020.pdf'
    ).load()

    # Combine in CSV, RSS, PDF order and preview the first five documents
    all_docs = [*csv_docs, *rss_docs, *pdf_docs]
    print(all_docs[:5])

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunk the loaded documents; only needed when the vectorstore is being rebuilt.
if not vectordb_exists:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=100,
        add_start_index=True    # records each chunk's `start_index` in its metadata
    )
    all_splits = text_splitter.split_documents(all_docs)
    # A bare expression nested inside an `if` is not auto-displayed by Jupyter,
    # so report a compact summary explicitly instead of dumping every chunk.
    print(f"Split {len(all_docs)} documents into {len(all_splits)} chunks")


In [5]:
from langchain_ollama import OllamaEmbeddings

# Embedding function backed by Ollama; the model name is read from config.
local_embeddings = OllamaEmbeddings(
    model=config.OLLAMA_EMBEDDINGS_MODEL,
)

In [8]:
from langchain_chroma import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata

# Open the persisted vectorstore once, so `vectorstore` is always defined for the
# cells below (even when there are no splits to add), then populate it if a rebuild is needed.
vectorstore = Chroma(embedding_function=local_embeddings, persist_directory=config.VECTORSTORE_PERSIST_DIR)

if not vectordb_exists:
    # NOTE(review): documents are added to whatever collection already lives in the persist
    # directory, so rebuilding over a populated directory likely duplicates entries -
    # consider clearing the directory/collection first. TODO confirm.
    # Add the documents in smaller batches (https://community.openai.com/t/error-while-reading-pdf-file-using-openai-chromadb-module/883612)
    batch_size = 5461  # Set to the maximum allowed batch size
    for i in range(0, len(all_splits), batch_size):
        batch = all_splits[i:i + batch_size]
        # filter_complex_metadata is applied to each batch before it is stored
        vectorstore.add_documents(documents=filter_complex_metadata(batch))


In [16]:
# Question to answer, and the retriever that fetches the 4 best-matching chunks for it.
question = "Summarize the ATT&CK Design Philosophy."
retriever = vectorstore.as_retriever(
    search_type=config.VECTORSTORE_SEARCH_TYPE,
    search_kwargs=dict(k=4),
)
retrieved_docs = retriever.invoke(question)

In [17]:
# Inspect the documents the retriever returned for the question
retrieved_docs

[Document(metadata={'page': 3, 'source': 'rag_documents/ATTACK_Design_and_Philosophy_March_2020.pdf', 'start_index': 0}, page_content='v ©2020 The MITRE Corporation. All Rights Reserved Approved for Public Release. Distribution unlimited 19-01075-28. \nExecutive Summary This paper discusses the motivation behind the creation of ATT&CK, the components described within it, its design philosophy, how the project has progressed, and how it can be used. It is meant to be used as an authoritative source of information about ATT&CK as well as a guide for how ATT&CK is maintained and how the ATT&CK methodology is applied to create knowledge bases for new domains.'),
 Document(metadata={'page': 4, 'source': 'rag_documents/ATTACK_Design_and_Philosophy_March_2020.pdf', 'start_index': 0}, page_content='vi ©2020 The MITRE Corporation. All Rights Reserved Approved for Public Release. Distribution unlimited 19-01075-28. \nPreface This paper documents the published version of ATT&CK as of March 2020 w

In [18]:
# Flatten the retrieved chunks into a single space-separated context string
context = ' '.join(doc.page_content for doc in retrieved_docs)
context

'v ©2020 The MITRE Corporation. All Rights Reserved Approved for Public Release. Distribution unlimited 19-01075-28. \nExecutive Summary This paper discusses the motivation behind the creation of ATT&CK, the components described within it, its design philosophy, how the project has progressed, and how it can be used. It is meant to be used as an authoritative source of information about ATT&CK as well as a guide for how ATT&CK is maintained and how the ATT&CK methodology is applied to create knowledge bases for new domains. vi ©2020 The MITRE Corporation. All Rights Reserved Approved for Public Release. Distribution unlimited 19-01075-28. \nPreface This paper documents the published version of ATT&CK as of March 2020 with the addition of sub-techniques. MITRE has announced plans to evolve and expand ATT&CK throughout 2020 [1]. This paper will be maintained as a living document and will be updated as significant changes are made to ATT&CK and the process used to maintain the content wit

In [19]:
from langchain_ollama.llms import OllamaLLM

# LLM served by Ollama; the model name is read from config.
llm = OllamaLLM(model=config.OLLAMA_LLM_MODEL)

# Prompt that pairs the question with the retrieved context.
rag_prompt = f"""
    Answer the question according to the context:
        Question: {question}
        Context: {context}
"""
response = llm.invoke(rag_prompt)

In [30]:
from IPython.display import display, Markdown, Latex
# Render the LLM response as Markdown in the cell output
display(Markdown(response))

The ATT&CK Design Philosophy is based on a collaborative effort between researchers at The MITRE Corporation to create a comprehensive framework for understanding and mitigating advanced threat actors' (ATPs) tactics, techniques, and procedures (TTPs). Here are the key components described within it:

1. **Components**: The design philosophy includes several key components that address different aspects of ATPs:
   - **Behavioral Analysis Network**: A network of human analysts who analyze publicly available data to identify trends and patterns.
   - **Network Analysis Framework**: A methodology for analyzing networks, including identifying relationships between actors, systems, and other entities.
   - **TTP Identification Tool (TIT)**: A tool used to automatically identify known TTPs in publicly available data.

2. **Design Philosophy**: The ATT&CK design philosophy emphasizes the importance of a human-centered approach:
   - **Human-centric approach**: Understanding how ATPs operate, plan, and adapt to different situations.
   - **Collaboration and community engagement**: Building partnerships with practitioners, researchers, and organizations to share insights and best practices.

3. **Progress and future developments**: The ATT&CK methodology has been refined through various iterations, including the addition of sub-techniques:
   - **Sub-technique additions**: New techniques are continually being added to improve the accuracy and effectiveness of the framework.
   - **Evolution of the design philosophy**: MITRE plans to evolve and expand ATT&CK throughout 2020, reflecting changing threat landscape and advancements in technology.

4. **Usage and community engagement**: The design philosophy is used as an authoritative source for knowledge about ATPs:
   - **Guide for maintaining content**: ATT&CK methodology is applied to create knowledge bases for new domains.
   - **Training programs**: Community workshops provide forums for practitioners to share insights, use cases, and collaborative approaches.

5. **Upcoming events**: ATT&CKcon 5.0 will be held in October 2020, featuring virtual and in-person attendance:
   - **Call for presentations**: An open process for submitting speaker proposals.
   - **Sponsorship opportunities**: Companies interested in sponsoring the event can reach out to attackcon@mitre.org.

The ATT&CK design philosophy prioritizes collaboration, human-centered understanding of ATPs, and continuous improvement through community engagement.