# Prereq

1. pip install -r requirements.txt
2. Download https://ollama.com/ and run in CMD "ollama run ollama3"
3. Download Docker https://www.docker.com/ and run "docker run --rm --gpus=all -p 6333:6333 -p 6334:6334 -e QDRANT__GPU__INDEXING=1 qdrant/qdrant:gpu-nvidia-latest"

In [1]:
import os
import json
import yaml
import openai
import glob
import requests
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct
import uuid
import tqdm as notebook_tqdm
from mitreattack.stix20 import MitreAttackData

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
knowledge = []

In [3]:
# MITRE DATA
mitre_attack_data = MitreAttackData("data/enterprise-attack.json")
techniques = mitre_attack_data.get_techniques(remove_revoked_deprecated=True)

for ttp in techniques:
    knowledge.append({
        "id": str(uuid.uuid4()),
        "source": "MITRE",
        "title": ttp["name"],
        "content": ttp.get("description", ""),
        "type": "TTP",
        "tags": [
            ttp["id"],  
            ttp.get("external_references", [{}])[0].get("external_id", "")
        ] + ttp.get("x_mitre_platforms", [])
    })

In [4]:
# LOLBAS DATA
with open("data/lolbas.json", "r") as f:
    lolbas_items = json.load(f)

for item in lolbas_items:
    name = item.get("Name")
    description = item.get("Description", "")
    full_paths = [p.get("Path") for p in item.get("Full_Path", [])]
    url = item.get("url", "")
    
    for cmd in item.get("Commands", []):
        command_text = cmd.get("Command", "")
        cmd_desc = cmd.get("Description", "")
        mitre_id = cmd.get("MitreID", "")
        usecase = cmd.get("Usecase", "")
        category = cmd.get("Category", "")
        os_list = cmd.get("OperatingSystem", "")

        knowledge.append({
            "id": str(uuid.uuid4()),
            "source": "LOLBAS",
            "title": name,
            "content": f"{description}\nCommand: {command_text}\n{cmd_desc}\nUsecase: {usecase}\nOS: {os_list}",
            "type": "LOLBIN",
            "tags": [
                mitre_id,
                category,
                *full_paths,
                *([url] if url else [])
            ]
        })

In [5]:
with open("data/analyst_notes.json", "r") as f:
    notes = json.load(f)

for note in notes:
    knowledge.append({
        "id": str(uuid.uuid4()),
        "source": note.get("source", "Analyst"),
        "title": note.get("title", "Untitled Note"),
        "content": note.get("content", ""),
        "type": "ANALYST_NOTE",
        "tags": note.get("tags", [])
    })

In [6]:
model = SentenceTransformer("all-MiniLM-L6-v2")
qdrant = QdrantClient(host="localhost", port=6333)

In [7]:
collection_name = "soc_kb"
qdrant.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=384, distance=Distance.COSINE)
)

  qdrant.recreate_collection(


True

In [8]:
# Upload documents
points = []
for doc in tqdm(knowledge):
    vec = model.encode(doc["content"])
    point = PointStruct(
        id=doc["id"],
        vector=vec,
        payload={k: doc[k] for k in doc if k != "id" and k != "content"}
    )
    points.append(point)

qdrant.upsert(collection_name=collection_name, points=points)

100%|██████████| 1121/1121 [00:28<00:00, 39.06it/s]


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [9]:
alerts = [] 

with open("data/sample_alerts.json", "r") as f:
    raw_alerts = json.load(f)

for raw in raw_alerts:
    alerts.append({
        "alert_id": raw.get("alert_id"),
        "timestamp": raw.get("timestamp"),
        "host": raw.get("host"),
        "user": raw.get("username"),
        "process": raw.get("process"),
        "command_line": raw.get("command_line"),
        "parent_process": raw.get("parent_process"),
        "event_id": raw.get("event_id"),
        "severity": raw.get("severity"),
        "detection_rule": raw.get("tags"),
        "description": raw.get("description"),
        "context": [], 
        "raw": raw
    })

df = pd.DataFrame(alerts)

df.to_csv("data/output/normalized_alerts.csv", index=False)

In [10]:
model = SentenceTransformer("all-MiniLM-L6-v2")
qdrant = QdrantClient(host="localhost", port=6333)
alerts = pd.read_csv("data/output/normalized_alerts.csv")

In [11]:
# Retrieve context for each command
context_results = []
for cmd in alerts["command_line"]:
    vec = model.encode(cmd)
    results = qdrant.search(
        collection_name="soc_kb",
        query_vector=vec,
        limit=5
    )
    context = [r.payload | {"score": r.score} for r in results]
    context_results.append(context)

alerts["context"] = context_results
alerts.to_pickle("data/output/enriched_alerts.pkl")

  results = qdrant.search(


In [12]:
alerts = pd.read_pickle("data/output/enriched_alerts.pkl")

In [13]:
def classify(alert):
    context = alert['context']
    ctx_text = "\n".join(
        [f"[{x['source']}] {x['title']} - {x.get('score', 0):.2f}\n{x.get('content', '')[:300]}" for x in context]
    )

    prompt = f"""
Given the alert below:
Command Line: {alert['command_line']}
Parent Process: {alert['parent_process']}

Context:
{ctx_text}

Classify as True Positive or False Positive. Explain why.
"""

    response = requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": "llama3",
            "prompt": prompt.strip(),
            "stream": False,
            "temperature": 0.3
        }
    )

    if response.status_code == 200:
        return response.json()["response"].strip()
    else:
        return f"Error: {response.status_code} - {response.text}"

In [16]:
classified_alerts = []

for idx, alert in alerts.iterrows():
    print(f"\nAlert #{idx + 1}")
    print(f"Timestamp: {alert['timestamp']}")
    print(f"Host: {alert['host']}")
    print(f"User: {alert['user']}")
    print(f"Command Line: {alert['command_line']}")
    print(f"Parent Process: {alert['parent_process']}")
    print(f"Detection Rule: {alert['detection_rule']}")
    print(f"Severity: {alert['severity']}")
    

    try:
        result = classify(alert)
        print("\n--- LLM Classification Result ---")
        print(result)

        alert["llm_classification"] = result
        classified_alerts.append(alert)

    except Exception as e:
        print(f"Error during classification: {str(e)}")


Alert #1
Timestamp: 2025-07-03T14:32:11Z
Host: WIN-CLIENT01
User: jdoe
Command Line: powershell.exe -nop -w hidden -EncodedCommand SQBFAFg...
Parent Process: explorer.exe
Detection Rule: ['T1059.001', 'Execution', 'Scripting', 'EncodedCommand']
Severity: High

--- LLM Classification Result ---
A fun one!

Based on the alert, I would classify this as a **True Positive**.

Here's why:

* The Command Line contains an encoded PowerShell command (`-EncodedCommand SQBFAFg...`). This suggests that someone is trying to execute a PowerShell command in a way that's designed to avoid detection.
* The Parent Process is `explorer.exe`, which is a legitimate Windows process. However, this doesn't necessarily mean it's a false positive, as attackers often use legitimate processes to launch malicious payloads.
* The other entries listed are LOLBAS (Living Off the Land Binary and Script) tools, which are known techniques used by attackers to execute malicious code without leaving any obvious traces. T