In [1]:
from llama_index.core import Document
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader,StorageContext
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.ollama import Ollama
from llama_index.core.llms import ChatMessage  
from IPython.display import Markdown, display
import gradio as gr
from llama_index.core.prompts import PromptTemplate
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core import Settings
from dotenv import load_dotenv
import os
from openai import OpenAI
import openai
from llama_index.core import ServiceContext
from llama_index.llms.openai import OpenAI as LlamaOpenAI  # LlamaIndex-compatible

In [2]:
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = OpenAI(api_key=openai_api_key)

In [3]:
DEFAULT_MODEL="LLaMA 3.2 (Ollama)"
SUPPORTED_MODELS = {
    "OpenAI GPT-4": {"provider": "openai", "model_name": "gpt-4"},
    DEFAULT_MODEL: {"provider": "ollama", "model_name": "llama3.2"}
}

llm_cache = {}
for model_name, config in SUPPORTED_MODELS.items():
    if config["provider"] == "openai":
        llm_cache[model_name] = LlamaOpenAI(model_name=config["model_name"])
    elif config["provider"] == "ollama":
        llm_cache[model_name] = Ollama(model=config["model_name"], 
                                       request_timeout=120, 
                                       base_url="http://127.0.0.1:11434")

In [4]:
def get_llm_from_choice(choice: str):
    config = SUPPORTED_MODELS[choice]
    if choice not in llm_cache:
      raise ValueError("Unsupported provider")
    
    return llm_cache[choice]    

In [5]:
documentDir="docs/page_markdowns"
vectorDbLocation="demo_vectorDB"
indexStorage="indexStorage"

system_message = "You are a helpful assistant"

template_str = """
You are a cybersecurity expert assistant analyzing threat intelligence reports. You provide detailed, technical responses based on stored security knowledge for security analysts and incident responders.

Context Information:
{context_str}

Analyst Question:
{query_str}

Instructions:
- Provide technical details when available
- Reference specific threat actors, TTPs, or indicators if mentioned in the context
- If the context doesn't contain relevant information, clearly state this
- Format your response for security professionals

Response:
"""

prompt_template = PromptTemplate(template_str)

In [6]:
documents = SimpleDirectoryReader(documentDir).load_data()
for doc in documents:
    filename = doc.metadata.get("file_path", "")
    import re
    match = re.search(r"page[_\-]?(\d+)", filename.lower())
    if match:
        doc.metadata["page"] = int(match.group(1))


In [7]:
documents[:1]

[Document(id_='418d2a22-2862-4eae-ac03-91d7a4e309b5', embedding=None, metadata={'file_path': '/teamspace/studios/this_studio/docs/page_markdowns/page_1.md', 'file_name': 'page_1.md', 'file_type': 'text/markdown', 'file_size': 43, 'creation_date': '2025-06-05', 'last_modified_date': '2025-06-05', 'page': 1}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='2 0 2 5\nG L O B A L T H R E A T\nR E P O R T', path=None, url=None, mimetype=None), image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}')]

In [8]:
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import MarkdownNodeParser
embed_model =  OllamaEmbedding(model_name="qwen2.5")
#embed_model =  OpenAIEmbedding()

In [9]:
md_parser = MarkdownNodeParser()
md_nodes = md_parser.get_nodes_from_documents(documents)

md_nodes

In [10]:
semantic_parser = SemanticSplitterNodeParser(
    embed_model=embed_model,
    chunk_size=1024,
    chunk_overlap=128
)
semantic_nodes = semantic_parser.build_semantic_nodes_from_documents(md_nodes)

In [15]:
for i, node in enumerate(semantic_nodes[:5]):
    print(f"Node {type(node)}:")
    print(f"  Text: {node.text[:100]}") 
    print(f"  embeddin: {node}\n")

Node <class 'llama_index.core.schema.TextNode'>:
  Text: 2 0 2 5
G L O B A L T H R E A T
R E P O R T
  embeddin: Node ID: de7f4b8c-e1ef-4385-8166-77ec1a3305ac
Text: 2 0 2 5 G L O B A L T H R E A T R E P O R T

Node <class 'llama_index.core.schema.TextNode'>:
  Text: # The Growing Reliance on Identity Attacks and Vulnerability Exploits

Every breach starts with init
  embeddin: Node ID: e604a3fb-4104-4423-98dc-f0470a1926e0
Text: # The Growing Reliance on Identity Attacks and Vulnerability
Exploits  Every breach starts with initial access, and identity-based
attacks are among the most effective entry methods. Instead of
traditional malware, adversaries favor faster and stealthier methods
such as vishing, social engineering, access broker services, and
trusted relationshi...

Node <class 'llama_index.core.schema.TextNode'>:
  Text: As adversaries scale identity-based attacks and vulnerability exploitation, organizations must adopt
  embeddin: Node ID: fc0c8332-c053-4779-8219-fd3f7478cec3


In [13]:
chroma_client = chromadb.PersistentClient(path=vectorDbLocation)
chroma_collection = chroma_client.get_or_create_collection("crowdstrike_docs")
vector_store = ChromaVectorStore(chroma_client=chroma_client, chroma_collection=chroma_collection,hybrid=True)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex(
    semantic_nodes,
    storage_context=storage_context,
    embed_model=embed_model
)

In [16]:
from llama_index.core.query_engine import RetrieverQueryEngine

retriever = index.as_retriever(similarity_top_k=3)



In [17]:
def message_model(prompt,model_choice):
    print(prompt)    
    llm_to_use = get_llm_from_choice(model_choice)
    
    print("\n=== RETRIEVED CONTEXT ===")    
    retrieved_nodes = retriever.retrieve(prompt)
    for i, node in enumerate(retrieved_nodes):
      print(f"Node {i+1} Score: {node.score:.4f}")
      print(f"Content Preview: {node.text[:200]}...")
      print("---")
        
    query_engine = RetrieverQueryEngine.from_args(
    retriever=retriever,
    llm=llm_to_use,
    text_qa_template=prompt_template,
    verbose=True
    )
    
    context_str = "\n\n".join([node.text for node in retrieved_nodes])
    formatted_prompt = prompt_template.format(
        context_str=context_str,
        query_str=prompt
    )
    
    print("\n=== FORMATTED PROMPT SENT TO LLM ===")
    print(formatted_prompt)
    print("=== END FORMATTED PROMPT ===\n")
    
    response = query_engine.query(prompt)
    return response.response

In [18]:
Which year of report is available

SyntaxError: invalid syntax (307704984.py, line 1)

In [None]:
gr.Interface(
    fn=message_model,
    inputs=[gr.Textbox(lines=2, label="Ask a question..."),
           gr.Dropdown(list(SUPPORTED_MODELS.keys()), label="Choose Model", value=DEFAULT_MODEL)],
    outputs=gr.Textbox(placeholder="Output text....."), 
    flagging_mode="never"
).queue().launch(share=True,debug=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://10b80b0646086992ef.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


report for which year is available

=== RETRIEVED CONTEXT ===
Node 1 Score: 0.0000
Content Preview: ## CrowdStrike Threat Graph
Uses cloud-scale AI to correlate trillions of data points from multiple telemetry sources to identify shifts in adversarial tactics and map tradecraft to automatically pred...
---
Node 2 Score: 0.0000
Content Preview: ### Educate users
- Provide regular security awareness training
- Teach employees to recognize phishing attempts and social engineering tactics...
---

=== FORMATTED PROMPT SENT TO LLM ===

You are a cybersecurity expert assistant analyzing threat intelligence reports. You provide detailed, technical responses based on stored security knowledge for security analysts and incident responders.

Context Information:
## CrowdStrike Threat Graph
Uses cloud-scale AI to correlate trillions of data points from multiple telemetry sources to identify shifts in adversarial tactics and map tradecraft to automatically predict and prevent threats in real time a