In [1]:
from llama_index.core import Document
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader,StorageContext
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.ollama import Ollama
from llama_index.core.llms import ChatMessage  
from IPython.display import Markdown, display
import gradio as gr
from llama_index.core.prompts import PromptTemplate
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core import Settings
from dotenv import load_dotenv
import os
from openai import OpenAI
import openai
from llama_index.core import ServiceContext
from llama_index.llms.openai import OpenAI as LlamaOpenAI  # LlamaIndex-compatible
from llama_index.vector_stores.elasticsearch import ElasticsearchStore

In [2]:
# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# OpenAI credentials: set on the module and on an explicit client instance.
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = OpenAI(api_key=openai_api_key)

# Elasticsearch connection settings; the index name is fixed for this demo.
ELASTIC_API_KEY = os.getenv("ELASTIC_API_KEY")
ELASTIC_URL = os.getenv("ELASTIC_URL")
ELASTIC_INDEX = "demo"

In [3]:
# Label of the model that is preselected in the UI dropdown.
DEFAULT_MODEL = "LLaMA 3.2 (Ollama)"

# Maps each user-facing label to its backend ("provider") and that backend's model id.
SUPPORTED_MODELS = {
    "OpenAI GPT-4": {"provider": "openai", "model_name": "gpt-4"},
    DEFAULT_MODEL: {"provider": "ollama", "model_name": "llama3.2"},
}

# One LLM client per supported label, built eagerly so later lookups are plain dict reads.
llm_cache = {}
for model_name, config in SUPPORTED_MODELS.items():
    if config["provider"] == "openai":
        # The LlamaIndex OpenAI wrapper takes the model id as `model`; `model_name`
        # is not one of its parameters, so the configured id would not be applied.
        llm_cache[model_name] = LlamaOpenAI(model=config["model_name"])
    elif config["provider"] == "ollama":
        llm_cache[model_name] = Ollama(
            model=config["model_name"],
            request_timeout=120,
            base_url="http://127.0.0.1:11434",
        )

In [4]:
def get_llm_from_choice(choice: str):
    """Return the cached LLM client for a model label.

    Args:
        choice: A key of ``llm_cache`` (the labels declared in ``SUPPORTED_MODELS``).

    Returns:
        The LLM instance stored in ``llm_cache`` under ``choice``.

    Raises:
        ValueError: If no client is cached for ``choice``.
    """
    # Validate before any lookup so an unknown label surfaces as the intended
    # ValueError rather than a bare KeyError from an unrelated dict access.
    if choice not in llm_cache:
        raise ValueError(f"Unsupported model choice: {choice!r}")

    return llm_cache[choice]

In [5]:
# Locations used by the notebook: source documents, vector DB, and index storage.
documentDir = "llama-vision"
vectorDbLocation = "demo_vectorDB"
indexStorage = "indexStorage"

# Generic system message.
system_message = "You are a helpful assistant"

# Question-answering prompt; {context_str} and {query_str} are filled in at query time.
template_str = """
You are a cybersecurity expert assistant analyzing threat intelligence reports. You provide detailed, technical responses based on stored security knowledge for security analysts and incident responders.

Context Information:
{context_str}

Analyst Question:
{query_str}

Instructions:
- Provide technical details when available
- Reference specific threat actors, TTPs, or indicators if mentioned in the context
- If the context doesn't contain relevant information, clearly state this
- Format your response for security professionals

Response:
"""

# Wrap the raw string so it can be passed to the query engine as its QA template.
prompt_template = PromptTemplate(template_str)

In [18]:
import re

# Matches "page1", "page_12", "page-3", ... in a lower-cased path and captures the number.
PAGE_NUMBER_PATTERN = re.compile(r"page[_\-]?(\d+)")

# Load every file under documentDir, then tag each document with the page
# number embedded in its file path (when there is one).
documents = SimpleDirectoryReader(documentDir).load_data()
for doc in documents:
    # Fall back to "" so a missing or None file_path cannot break .lower().
    filename = doc.metadata.get("file_path") or ""
    match = PAGE_NUMBER_PATTERN.search(filename.lower())
    if match:
        doc.metadata["page"] = int(match.group(1))


In [11]:
# Inspect the first loaded document (text and metadata) as a quick sanity check.
documents[:1]

[Document(id_='720f38ea-072c-47e6-84de-95be31d9d0a6', embedding=None, metadata={'file_path': '/teamspace/studios/this_studio/llama-vision/page1.md', 'file_name': 'page1.md', 'file_type': 'text/markdown', 'file_size': 1694, 'creation_date': '2025-06-06', 'last_modified_date': '2025-06-06', 'page': 1}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='The image presents the cover of CrowdStrike\'s 2025 Global Threat Report, featuring a striking visual representation of three characters in a futuristic cityscape. The title, "CROWDSTRIKE 2025 GLOBAL THREAT REPORT," is prominently displayed in bold red font at the top of the image.\n\n**Key Eleme

In [6]:
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import MarkdownNodeParser
# Embedding model used by the semantic splitter and the vector index in later cells.
embed_model = OllamaEmbedding(model_name="nomic-embed-text")  # Ollama's embedding model
# Alternative backend: uncomment to use OpenAI embeddings instead.
#embed_model =  OpenAIEmbedding()

In [20]:
# Parse the loaded documents into nodes with the markdown-aware parser.
md_parser = MarkdownNodeParser()
md_nodes = md_parser.get_nodes_from_documents(documents)

# Show only a small sample; echoing the whole list floods the notebook output.
md_nodes[:3]

In [21]:
# SemanticSplitterNodeParser has no `chunk_size` / `chunk_overlap` options, so
# passing them configures nothing. Its actual tuning knobs are spelled out here
# at their documented default values, which keeps the splitting unchanged.
semantic_parser = SemanticSplitterNodeParser(
    embed_model=embed_model,
    buffer_size=1,
    breakpoint_percentile_threshold=95,
)
# Re-split the markdown nodes into semantically coherent chunks.
semantic_nodes = semantic_parser.build_semantic_nodes_from_documents(md_nodes)

In [15]:
# Preview the first few semantic chunks: position, node class, a text snippet,
# and the node's own string form.
for i, node in enumerate(semantic_nodes[:5]):
    print(f"Node {i} ({type(node).__name__}):")
    print(f"  Text: {node.text[:100]}")
    # This line prints the node itself, not an embedding, so label it accordingly.
    print(f"  Node: {node}\n")

Node <class 'llama_index.core.schema.TextNode'>:
  Text: The image presents the cover of CrowdStrike's 2025 Global Threat Report, featuring a striking visual
  embeddin: Node ID: 6831c7dc-8690-4bd1-bd74-cc3ac1dc2cc4
Text: The image presents the cover of CrowdStrike's 2025 Global Threat
Report, featuring a striking visual representation of three characters
in a futuristic cityscape. The title, "CROWDSTRIKE 2025 GLOBAL THREAT
REPORT," is prominently displayed in bold red font at the top of the
image.  **Key Elements:**  * **Title:** CROWDSTRIKE 2025 GLOBAL THREAT
R...

Node <class 'llama_index.core.schema.TextNode'>:
  Text: Overall, the image is well-designed and effectively communicates the key message of the report.
  embeddin: Node ID: cb68eb9f-0f88-46f4-9140-21248fa23c67
Text: Overall, the image is well-designed and effectively communicates
the key message of the report.

Node <class 'llama_index.core.schema.TextNode'>:
  Text: **The Growing Reliance on Identity Attacks and Vulnerab

In [7]:
# Elasticsearch-backed vector store; endpoint, API key and index name come from
# the ELASTIC_* configuration values.
vector_store = ElasticsearchStore(
    index_name=ELASTIC_INDEX,
    es_url=ELASTIC_URL,
    es_api_key=ELASTIC_API_KEY,
)

In [23]:
# Ingestion path: build the index from the semantic nodes, backed by the vector store.
# NOTE(review): presumably every run of this cell writes the nodes again — confirm before re-running.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(
    nodes=semantic_nodes,
    embed_model=embed_model,
    storage_context=storage_context,
)

In [8]:
# Query-only path: build the index object on top of the existing vector store.
index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)

In [9]:
from llama_index.core.query_engine import RetrieverQueryEngine

# Retriever over the index, configured with similarity_top_k=3 (three nodes per query).
retriever = index.as_retriever(similarity_top_k=3)



In [10]:
def message_model(prompt, model_choice):
    """Answer a question with retrieval-augmented generation.

    Retrieves context nodes for ``prompt``, prints them together with a preview
    of the filled-in prompt template, and asks the selected LLM to answer from
    that same context.

    Args:
        prompt: The user's question.
        model_choice: Model label understood by ``get_llm_from_choice``.

    Returns:
        The generated answer text, or a short hint when ``prompt`` is blank.

    Raises:
        ValueError: Propagated from ``get_llm_from_choice`` for an unknown label.
    """
    # A blank question has nothing to retrieve against; skip all backend calls.
    if not prompt or not prompt.strip():
        return "Please enter a question."

    llm_to_use = get_llm_from_choice(model_choice)

    print("\n=== RETRIEVED CONTEXT ===")
    retrieved_nodes = retriever.retrieve(prompt)
    for i, node in enumerate(retrieved_nodes):
        # A retrieved node may carry no score; formatting None with :.4f would raise.
        score_text = "n/a" if node.score is None else f"{node.score:.4f}"
        print(f"Node {i+1} Score: {score_text}")
        print(f"Content Preview: {node.text[:200]}...")
        print("---")

    query_engine = RetrieverQueryEngine.from_args(
        retriever=retriever,
        llm=llm_to_use,
        text_qa_template=prompt_template,
        verbose=True,
    )

    # Debug preview of the template filled with the retrieved text.
    context_str = "\n\n".join(node.text for node in retrieved_nodes)
    formatted_prompt = prompt_template.format(
        context_str=context_str,
        query_str=prompt,
    )

    print("\n=== FORMATTED PROMPT SENT TO LLM ===")
    print(formatted_prompt)
    print("=== END FORMATTED PROMPT ===\n")

    # Local import: only this call needs QueryBundle.
    from llama_index.core.schema import QueryBundle

    # Synthesize from the nodes already retrieved above. Calling query() would
    # run retrieval a second time, doubling the embedding and search work and
    # letting the answer's context drift from what was just printed.
    response = query_engine.synthesize(
        QueryBundle(query_str=prompt),
        nodes=retrieved_nodes,
    )
    return response.response

In [11]:
# Smoke-test the pipeline end to end with the default model.
message_model(
    "as per adversary groups whats the name given to Pakistan",
    DEFAULT_MODEL,
)


=== RETRIEVED CONTEXT ===
Node 1 Score: 1.0000
Content Preview: # Naming Conventions

The following is a list of adversary groups, their nation-state or category, and any notable information about them....
---
Node 2 Score: 0.4693
Content Preview: ### Adversary Groups

| Adversary | Nation-State or Category | Notable Information |
| --- | --- | --- |
| Bear | Russia |  |
| Buffalo | Vietnam |  |
| Chollima | North Korea |  |
| Crane | South Kor...
---
Node 3 Score: 0.0000
Content Preview: ### Section 1: Introduction

During 2024, CrowdStrike Intelligence introduced 26 newly named adversaries, including the Kazakhstan-based adversary COMRADE SAIGA, raising the total number of named adve...
---

=== FORMATTED PROMPT SENT TO LLM ===

You are a cybersecurity expert assistant analyzing threat intelligence reports. You provide detailed, technical responses based on stored security knowledge for security analysts and incident responders.

Context Information:
# Naming Conventions

The follow

'Based on the provided adversary group list, the nation-state or category associated with Pakistan is Leopard. No additional notable information is available about this adversary group.'

In [None]:
# Gradio UI: a question box and a model selector wired to message_model,
# with the answer shown in a text box.
gr.Interface(
    fn=message_model,
    inputs=[
        gr.Textbox(lines=2, label="Ask a question..."),
        gr.Dropdown(
            list(SUPPORTED_MODELS),
            label="Choose Model",
            value=DEFAULT_MODEL,
        ),
    ],
    outputs=gr.Textbox(placeholder="Output text....."),
    flagging_mode="never",
).launch(share=False, debug=False)