In [21]:
!pip install langchain-experimental langchain-community langchain networkx langchain-google-genai langchain-core json-repair tiktoken



In [5]:
import re
import json
import os

def parse_rules_file(rules_file_path):
    """
    Parses a Suricata rules file and extracts relevant information
    :param rules_file_path: Path to the .rules file
    :return: A list of dictionaries with extracted rule information
    """
    extracted_data = []

    # Regex pattern to extract msg, classtype, and sid
    rule_pattern = re.compile(
        r'msg:"(?P<msg>.*?)";.*?classtype:(?P<classtype>[^;]+);.*?sid:(?P<sid>\d+);'
    )

    with open(rules_file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, 1):
            line = line.strip()
            # Skip comments/empty lines
            if not line or line.startswith('#'):
                continue

            try:
                match = rule_pattern.search(line)
                if match:
                    suri_rule_msg = match.group('msg')
                    suri_rule_classtype = match.group('classtype')
                    suri_rule_id = match.group('sid')

                    extracted_data.append({
                        "suri_rule_id": suri_rule_id,
                        "suri_rule_classtype": suri_rule_classtype,
                        "suri_rule_msg": suri_rule_msg
                    })
                else:
                    print(f"Warning: Line {line_number} in '{rules_file_path}' "
                          f"does not match expected format.")
            except Exception as e:
                print(f"Error processing line {line_number} in '{rules_file_path}': {e}")

    return extracted_data


def parse_all_rules_in_directory(rules_folder_path):
    """
    Parses all .rules files within the specified folder and aggregates the data.
    :param rules_folder_path: Path to the folder containing .rules files
    :return: A list of extracted rule dictionaries (with file_name included)
    """
    all_extracted_rules = []

    # List all files in the folder
    for filename in os.listdir(rules_folder_path):
        if filename.endswith('.rules'):
            rules_file_path = os.path.join(rules_folder_path, filename)
            print(f"Parsing rules from file: {rules_file_path}")

            file_rules = parse_rules_file(rules_file_path)
            # Tag each rule with the file_name
            for rule in file_rules:
                rule["file_name"] = filename

            all_extracted_rules.extend(file_rules)

    return all_extracted_rules


def save_to_json(data, output_file_path):
    """
    Saves extracted data to a JSON file
    :param data: List of extracted rule data
    :param output_file_path: Path to the output JSON file
    """
    with open(output_file_path, "w", encoding="utf-8") as json_file:
        json.dump(data, json_file, indent=4)
    print(f"\nExtracted rules saved to: {output_file_path}")


if __name__ == "__main__":
    # Folder containing the .rules files
    rules_folder = "rules"  # or "/content/rules" if in Google Colab, etc.

    # Output JSON file path
    output_file_path = "testData.json"

    # Parse all .rules files in the folder
    print(f"Scanning for .rules files in: {rules_folder}")
    extracted_rules = parse_all_rules_in_directory(rules_folder)

    # Save to JSON
    save_to_json(extracted_rules, output_file_path)

    # Print summary
    print(f"Total .rules files processed: "
          f"{len([f for f in os.listdir(rules_folder) if f.endswith('.rules')])}")
    print(f"Total rules extracted: {len(extracted_rules)}")


Scanning for .rules files in: rules


FileNotFoundError: [Errno 2] No such file or directory: 'rules'

In [49]:
# Set OpenAI API key directly
import os
os.environ['OPENAI_API_KEY'] = ''

In [35]:
URL="" #p[inecone url]
import os

# Replace these strings with your actual API keys
os.environ["PINECONE_API_KEY"] = ""
os.environ["OPENAI_KEY"] = ""
# Optionally print to verify they are set (optional, remove in production)
print("PINECONE_API_KEY:", os.environ.get("PINECONE_API_KEY", "Not found"))
print("OPENAI_KEY:", os.environ.get("OPENAI_KEY", "Not found"))



PINECONE_API_KEY: pcsk_4iJy6B_4jDRvxB4cU9yVd8LHkZNZmqEMwoKkDVdzPEzuPycBfJDpvgbek6TBGKiozbjDwq
OPENAI_KEY: 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


In [23]:
!pip install pinecone-client langchain_community unittest


[31mERROR: Could not find a version that satisfies the requirement unittest (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for unittest[0m[31m
[0m

In [24]:
!pip install langchain_community



In [27]:
!pip install tiktoken pinecone openai



In [46]:
import unittest
import time

class TestExperienceManagerAndVectorDB(unittest.TestCase):

    def setUp(self):
        self.project_name = "testautoattackerproject"  # Must be lower-case, no special chars
        self.vectordb_name = "test_autoattacker_vectordb"
        # self.experience_manager = ExperienceManager(project_name=self.project_name,
        #                                             vectordb_name=self.vectordb_name)
        self.vector_db = customVectorDB(project_name=self.project_name,
                                        vectordb_name=self.vectordb_name)

    def tearDown(self):
        # Optionally delete the index after tests
        self.vector_db.delete_index()
        pass

    def test_store_and_retrieve_text(self):
        test_text = "This is a test text for vector database."
        self.vector_db.store_text(test_text)
        time.sleep(5)  # Wait for indexing
        results = self.vector_db.retrieval("test text")
        print("Stored Text Retrieval Results:", results)
        self.assertGreater(len(results), 0)
        if results:
            self.assertIn("This is a test text for vector database.", results[0].page_content)

    def test_store_and_retrieve_file(self):
        test_file_content = "This is a test file content for vector database."
        with open("test_file.txt", "w") as f:
            f.write(test_file_content)

        self.vector_db.store_file("test_file.txt")
        time.sleep(5)
        results = self.vector_db.retrieval("test file content")
        print("Stored File Retrieval Results:", results)
        self.assertGreater(len(results), 0)
        if results:
            self.assertIn("This is a test file content for vector database.", results[0].page_content)

    # def test_store_and_retrieve_experience(self):
    #     test_action_plan = "Execute shell command to escalate privileges."
    #     self.experience_manager.store_experience(test_action_plan, metadata=None)
    #     time.sleep(5)
    #     results = self.experience_manager.retrieve_experiences("escalate privileges", top_k=1)
    #     print("Stored Experience Retrieval Results:", results)
    #     self.assertGreater(len(results), 0)
    #     if results:
    #         self.assertIn("Execute shell command to escalate privileges.", results[0].page_content)


# Run the tests
suite = unittest.TestLoader().loadTestsFromTestCase(TestExperienceManagerAndVectorDB)
unittest.TextTestRunner(verbosity=2).run(suite)


test_store_and_retrieve_file (__main__.TestExperienceManagerAndVectorDB.test_store_and_retrieve_file) ... 

Stored File Retrieval Results: [Document(metadata={}, page_content='This is a test file content for vector database.'), Document(metadata={'technique_id': 'T1595', 'technique_name': 'Active Scanning'}, page_content='Technique ID: T1595\nTechnique Name: Active Scanning\n\nAdversaries may execute active reconnaissance scans to gather information that can be used during targeting. Active scans are those where the adversary probes victim infrastructure via network traffic, as opposed to other forms of reconnaissance that do not involve direct interaction.'), Document(metadata={'technique_id': 'T1595', 'technique_name': 'Active Scanning'}, page_content='Technique ID: T1595\nTechnique Name: Active Scanning\n\nAdversaries may execute active reconnaissance scans to gather information that can be used during targeting. Active scans are those where the adversary probes victim infrastructure via network traffic, as opposed to other forms of reconnaissance that do not involve direct interaction.')

ok
test_store_and_retrieve_text (__main__.TestExperienceManagerAndVectorDB.test_store_and_retrieve_text) ... FAIL


Stored Text Retrieval Results: []



FAIL: test_store_and_retrieve_text (__main__.TestExperienceManagerAndVectorDB.test_store_and_retrieve_text)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-46-4154f70b3f34>", line 25, in test_store_and_retrieve_text
    self.assertGreater(len(results), 0)
AssertionError: 0 not greater than 0

----------------------------------------------------------------------
Ran 2 tests in 52.006s

FAILED (failures=1)


<unittest.runner.TextTestResult run=2 errors=0 failures=1>

In [47]:
import os
import uuid
import pinecone
import time
import json
from typing import List

# Install necessary packages (if not already installed)
!pip install pinecone-client langchain_community

# Set environment variables (replace with your actual keys)
# os.environ["PINECONE_API_KEY"] = "YOUR_PINECONE_API_KEY"
# os.environ["OPENAI_KEY"] = "YOUR_OPENAI_API_KEY"

# LangChain-related imports
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Pinecone

from pinecone import Pinecone as PineconeClient, ServerlessSpec


class customVectorDB:
    """
    The custom VectorDB implementation behind Pinecone to support the chatbot.
    """

    def __init__(self, project_name: str, vectordb_name: str):
        assert project_name != "", "Project name cannot be empty."
        self.project_name = project_name

        # Load environment variables
        pinecone_api_key = os.getenv("PINECONE_API_KEY", "")
        os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_KEY", "")

        # In Colab, create a local directory if you wish.
        self.vectordb_directory = vectordb_name
        if not os.path.exists(self.vectordb_directory):
            os.mkdir(self.vectordb_directory)

        self.uuid = str(uuid.uuid4())
        self.local_context_directory = os.path.join(
            self.vectordb_directory, self.project_name + "_" + self.uuid
        )
        if not os.path.exists(self.local_context_directory):
            os.mkdir(self.local_context_directory)

        # Initialize Pinecone
        self.pinecone_instance = PineconeClient(api_key=pinecone_api_key)
        existing_indexes = self.pinecone_instance.list_indexes().names()

        if self.project_name not in existing_indexes:
            self.pinecone_instance.create_index(
                name=self.project_name,
                dimension=1536,  # 'text-embedding-ada-002' uses 1536 dimensions
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1")
            )

        self.vectorDB = Pinecone.from_existing_index(
            self.project_name,
            OpenAIEmbeddings()
        )

    def __del__(self):
        pass

    def _save_text(self, _text: str) -> str:
        filename = str(uuid.uuid4()) + ".txt"
        file_path = os.path.join(self.local_context_directory, filename)
        with open(file_path, "w") as f:
            f.write(_text)
        return file_path

    def store_file(self, filename: str, metadata: List[dict] = None):
        loader = TextLoader(filename)
        documents = loader.load()
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        texts = text_splitter.split_documents(documents)
        string_texts = [t.page_content for t in texts]
        self.vectorDB.add_texts(
            texts=string_texts,
            metadatas=metadata if metadata else None
        )

    def store_text(self, content: str, metadata: List[dict] = None):
        filename = self._save_text(content)
        self.store_file(filename, metadata=metadata)

    def retrieval(self, keyword: str, metadata: List[dict] = None) -> List[dict]:
        return self.vectorDB.similarity_search(keyword)

    def delete_index(self):
        self.pinecone_instance.delete_index(name=self.project_name)


class ExperienceManager:
    def __init__(self, project_name: str, vectordb_name: str):
        self.vector_db = customVectorDB(project_name, vectordb_name)

    def store_experience(self, action_plan: str, metadata: List[dict] = None):
        self.vector_db.store_text(action_plan, metadata=metadata)

    def retrieve_experiences(self, query: str, top_k: int = 3) -> List[dict]:
        results = self.vector_db.retrieval(query)
        return results[:top_k]


# Instantiate your customVectorDB and ExperienceManager
project_name = "testautoattackerproject"
vectordb_name = "test_autoattacker_vectorDB"

vector_db = customVectorDB(project_name, vectordb_name)  # if you want direct access
manager = ExperienceManager(project_name, vectordb_name) # if you prefer the manager
json_file_path = "techniques.json"  # The file you uploaded
push_techniques_to_experience_manager(json_file_path, manager)

# Wait a few seconds for indexing if you want to test retrieval immediately
time.sleep(5)

# Example retrieval
query = "Password Filter DLL"
results = manager.retrieve_experiences(query)
print(f"Results for query: '{query}'")
for idx, doc in enumerate(results):
    print(f"\nResult {idx+1} Content:\n", doc.page_content)
    print("Metadata:", doc.metadata)


Results for query: 'Password Filter DLL'

Result 1 Content:
 Technique ID: T1201
Technique Name: Password Policy Discovery

Adversaries may attempt to access detailed information about the password policy used within an enterprise network or cloud environment. Password policies are a way to enforce complex passwords that are difficult to guess or crack through Brute Force. This information may help the adversary to create a list of common passwords and launch dictionary and/or brute force attacks which adheres to the policy (e.g. if the minimum password length should be 8, then not trying passwords such as 'pass123'; not checking for more than 3-4 passwords per account if the lockout is set to 6 as to not lock out accounts).
Metadata: {'technique_id': 'T1201', 'technique_name': 'Password Policy Discovery'}

Result 2 Content:
 Technique ID: T1555
Technique Name: Credentials from Password Stores

Adversaries may search for common password storage locations to obtain user credentials. Pas

In [48]:
query = "Password Filter DLL"
results = manager.retrieve_experiences(query)
for idx, doc in enumerate(results):
    print(f"\nResult {idx+1} Content:\n", doc.page_content)
    print("Metadata:", doc.metadata)



Result 1 Content:
 Technique ID: T1201
Technique Name: Password Policy Discovery

Adversaries may attempt to access detailed information about the password policy used within an enterprise network or cloud environment. Password policies are a way to enforce complex passwords that are difficult to guess or crack through Brute Force. This information may help the adversary to create a list of common passwords and launch dictionary and/or brute force attacks which adheres to the policy (e.g. if the minimum password length should be 8, then not trying passwords such as 'pass123'; not checking for more than 3-4 passwords per account if the lockout is set to 6 as to not lock out accounts).
Metadata: {'technique_id': 'T1201', 'technique_name': 'Password Policy Discovery'}

Result 2 Content:
 Technique ID: T1555
Technique Name: Credentials from Password Stores

Adversaries may search for common password storage locations to obtain user credentials. Passwords are stored in several places on a 

In [42]:
def push_techniques_to_experience_manager(json_file_path: str, manager: ExperienceManager):
    with open(json_file_path, "r") as f:
        techniques = json.load(f)

    for technique in techniques:
        # Combine fields into a single text block
        text_content = (
            f"Technique ID: {technique.get('technique_id', '')}\n"
            f"Technique Name: {technique.get('technique_name', '')}\n\n"
            f"{technique.get('technique_description', '')}"
        )

        # Optional: store some metadata as a list of dict or just a single dict
        metadata = [{
            "technique_id": technique.get("technique_id", ""),
            "technique_name": technique.get("technique_name", ""),
        }]

        # Store via ExperienceManager
        manager.store_experience(text_content, metadata=metadata)


In [None]:
#I get the .csv forom suricaa_Rule enhancer.py script and then uplaod that here.
#upload the MITRE Techniques.json file as well techniques.json
#In case the technique ID is not present in the Hashmap it can requery the llm.

In [None]:
# import json
# import os
# import csv
# import re
# import openai
# from collections import OrderedDict
# from google.colab import files  # Google Colab auto-download

# # OpenAI Client
# client = openai
# client.api_key = os.getenv("OPENAI_API_KEY")

# # Cache to avoid redundant API calls
# cache = {}

# # Regex for extracting msg and classtype
# MSG_REGEX = re.compile(r'msg:"([^"]+)"', re.IGNORECASE)
# CLASSTYPE_REGEX = re.compile(r'classtype:([^;]+);', re.IGNORECASE)

# # Constants
# BATCH_SIZE = 2000
# SAVE_INTERVAL = 1000
# OUTPUT_DIR = "output_batches"
# TECHNIQUES_JSON_FILE = "techniques.json"
# TEST_MODE = False

# # Ensure output directory exists
# if not os.path.exists(OUTPUT_DIR):
#     os.makedirs(OUTPUT_DIR)

# # Load MITRE ATT&CK Techniques
# def load_mitre_techniques():
#     with open(TECHNIQUES_JSON_FILE, "r", encoding="utf-8") as f:
#         techniques = json.load(f)
#     return {t["technique_id"] for t in techniques}  # Store only technique IDs for fast lookup

# MITRE_TECHNIQUES = load_mitre_techniques()

# # Read processed rule IDs from existing JSON files
# def get_processed_rule_ids():
#     processed_ids = set()
#     for filename in os.listdir(OUTPUT_DIR):
#         if filename.endswith(".json"):
#             with open(os.path.join(OUTPUT_DIR, filename), "r", encoding="utf-8") as f:
#                 try:
#                     data = json.load(f)
#                     processed_ids.update(entry["suri_rule_id"] for entry in data)
#                 except json.JSONDecodeError:
#                     print(f"⚠️ Warning: Corrupted JSON file {filename}. Skipping.")
#     return processed_ids

# # Create LLM prompt for single classification
# def create_prompt_for_single_classification(rule):
#     return f"""
# Map the following Suricata IDS rule to a **single MITRE ATT&CK technique** with a confidence score (rounded to 2 decimal places).

# ### Rule:
# ID: {rule["suri_rule_id"]}
# File Name: {rule["file_name"]}
# Action: {rule["action"]}
# Protocol: {rule["protocol"]}
# Source: {rule["src_addr"]}:{rule["src_port"]}
# Destination: {rule["dst_addr"]}:{rule["dst_port"]}
# Options: {rule["options"]}
# Classification: {rule["suri_rule_classtype"]}
# Message: "{rule["suri_rule_msg"]}"

# Respond in valid JSON format:
# {{
#   "mitre_technique_id": "<Technique ID>",
#   "mitre_technique_name": "<Technique Name>",
#   "confidence_score": "<Confidence Score (rounded to 2 decimal places)>"
# }}
# """.strip()

# # Query the LLM
# def query_llm(prompt, use_gpt4=False):
#     model_name = "gpt-4" if use_gpt4 else "gpt-3.5-turbo"
#     system_message = (
#         "You are a cybersecurity expert. Your task is to map Suricata IDS rules "
#         "to MITRE ATT&CK techniques, providing a confidence score (rounded to 2 decimal places)."
#     )

#     response = client.chat.completions.create(
#         model=model_name,
#         messages=[
#             {"role": "system", "content": system_message},
#             {"role": "user", "content": prompt},
#         ],
#         max_tokens=400,
#         temperature=0.0
#     )
#     return response

# # Validate technique, if invalid re-query LLM
# def validate_or_requery_technique(rule, mapping, use_gpt4=False):
#     """
#     If the technique ID is not found in MITRE_TECHNIQUES, we **re-query the LLM** with the same prompt.
#     """
#     if mapping["mitre_technique_id"] in MITRE_TECHNIQUES:
#         return mapping  # If it's valid, return it as is.

#     print(f"⚠️ Invalid MITRE Technique: {mapping['mitre_technique_id']} → Re-querying LLM...")

#     # Re-run the same LLM query to get a better technique match
#     prompt = create_prompt_for_single_classification(rule)
#     response = query_llm(prompt, use_gpt4=use_gpt4)
#     response_text = response.choices[0].message.content.strip().replace("```json", "").replace("```", "").strip()

#     try:
#         corrected_mapping = json.loads(response_text)
#         return corrected_mapping  # Return the corrected technique
#     except json.JSONDecodeError:
#         print(f"⚠️ Error: Unable to parse corrected technique for rule {rule['suri_rule_id']}. Skipping...")
#         return None  # Skip this rule if re-querying fails.

# # Process CSV and map rules
# def process_csv_and_map_to_mitre(csv_input_file, use_gpt4=False):
#     processed_rule_ids = get_processed_rule_ids()
#     rules = []

#     with open(csv_input_file, "r", encoding="utf-8") as f:
#         reader = csv.DictReader(f)
#         for i, row in enumerate(reader, start=1):
#             suri_rule_id = f"{row['file_name']}_{i}"
#             if suri_rule_id in processed_rule_ids:
#                 continue

#             options = row["options"]
#             msg_match = MSG_REGEX.search(options)
#             suri_rule_msg = msg_match.group(1).strip() if msg_match else "(no msg found)"

#             classtype_match = CLASSTYPE_REGEX.search(options)
#             suri_rule_classtype = classtype_match.group(1).strip() if classtype_match else "(no classtype found)"

#             rule_dict = {
#                 "suri_rule_id": suri_rule_id,
#                 "suri_rule_classtype": suri_rule_classtype,
#                 "suri_rule_msg": suri_rule_msg,
#                 "file_name": row["file_name"],
#                 "action": row["action"],
#                 "protocol": row["protocol"],
#                 "src_addr": row["src_addr"],
#                 "src_port": row["src_port"],
#                 "dst_addr": row["dst_addr"],
#                 "dst_port": row["dst_port"],
#                 "options": options
#             }
#             rules.append(rule_dict)

#     if TEST_MODE:
#         rules = rules[:10]

#     batch_count = 0
#     processed_results = []

#     for idx, rule in enumerate(rules, start=1):
#         prompt = create_prompt_for_single_classification(rule)
#         response = query_llm(prompt, use_gpt4=use_gpt4)
#         response_text = response.choices[0].message.content.strip().replace("```json", "").replace("```", "").strip()

#         try:
#             mapping = json.loads(response_text)
#             valid_mapping = validate_or_requery_technique(rule, mapping, use_gpt4)

#             if valid_mapping:
#               processed_results.append({
#                   "suri_rule_id": rule["suri_rule_id"],
#                   "suri_rule_classtype": rule["suri_rule_classtype"],
#                   "suri_rule_msg": rule["suri_rule_msg"],
#                   "file_name": rule["file_name"],
#                   "action": rule["action"],
#                   "protocol": rule["protocol"],
#                   "src_addr": rule["src_addr"],
#                   "src_port": rule["src_port"],
#                   "dst_addr": rule["dst_addr"],
#                   "dst_port": rule["dst_port"],
#                   "options": rule["options"],

#                   # Mapped MITRE Technique from LLM
#                   "mitre_technique_id": valid_mapping["mitre_technique_id"],
#                   "mitre_technique_name": valid_mapping["mitre_technique_name"],
#                   "confidence_score": valid_mapping["confidence_score"]
#               })
#         except json.JSONDecodeError:
#             print(f"⚠️ Skipping rule {rule['suri_rule_id']} due to invalid LLM response.")

#         if idx % SAVE_INTERVAL == 0 or idx == len(rules):
#             batch_count += 1
#             batch_output_file = os.path.join(OUTPUT_DIR, f"mapped_results_batch_{batch_count}.json")

#             with open(batch_output_file, "w", encoding="utf-8") as f_out:
#                 json.dump(processed_results, f_out, indent=4)

#             print(f"✅ Saved batch {batch_count} ({idx} rules) to {batch_output_file}")
#             files.download(batch_output_file)
#             processed_results = []

# if __name__ == "__main__":
#     process_csv_and_map_to_mitre("suricata_extracted_rules_parsed.csv", use_gpt4=False)


⚠️ Invalid MITRE Technique: T1100 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1100 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1107 → Re-querying LLM...
✅ Saved batch 1 (1000 rules) to output_batches/mapped_results_batch_1.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Saved batch 2 (2000 rules) to output_batches/mapped_results_batch_2.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  ev_args = tuple(_eval_type(a, globalns, localns, recursive_guard) for a in t.__args__)


✅ Saved batch 3 (3000 rules) to output_batches/mapped_results_batch_3.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

⚠️ Invalid MITRE Technique: T1100 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1500 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1100 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1506 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1506 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1506 → Re-querying LLM...
✅ Saved batch 4 (4000 rules) to output_batches/mapped_results_batch_4.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

⚠️ Invalid MITRE Technique: T1100 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1100 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1100 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1100 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1100 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1100 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1100 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1100 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1100 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1100 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1100 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1107 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1100 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1506 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1059.003 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1059.001 → Re-querying LLM...
⚠️

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

⚠️ Invalid MITRE Technique: T1073 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1022 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1024 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM...
⚠️ Invalid

KeyboardInterrupt: 

In [None]:
# import json
# import os
# import csv
# import re
# import openai
# from google.colab import files  # Google Colab auto-download

# # OpenAI Client
# client = openai
# client.api_key = os.getenv("OPENAI_API_KEY")

# # Cache to avoid redundant API calls
# cache = {}

# # Regex for extracting msg and classtype
# MSG_REGEX = re.compile(r'msg:"([^"]+)"', re.IGNORECASE)
# CLASSTYPE_REGEX = re.compile(r'classtype:([^;]+);', re.IGNORECASE)

# # Constants
# BATCH_SIZE = 2000
# SAVE_INTERVAL = 1000
# PROGRESS_UPDATE_INTERVAL = 50
# START_FROM_RULE = 5001
# OUTPUT_DIR = "output_batches"
# TECHNIQUES_JSON_FILE = "techniques.json"
# TEST_MODE = False

# # Ensure output directory exists
# if not os.path.exists(OUTPUT_DIR):
#     os.makedirs(OUTPUT_DIR)

# # Load MITRE ATT&CK Techniques
# def load_mitre_techniques():
#     with open(TECHNIQUES_JSON_FILE, "r", encoding="utf-8") as f:
#         techniques = json.load(f)
#     return {t["technique_id"]: t for t in techniques}  # Store technique details

# MITRE_TECHNIQUES = load_mitre_techniques()

# # Read processed rule IDs from existing JSON files
# def get_processed_rule_ids():
#     processed_ids = set()
#     for filename in os.listdir(OUTPUT_DIR):
#         if filename.endswith(".json"):
#             with open(os.path.join(OUTPUT_DIR, filename), "r", encoding="utf-8") as f:
#                 try:
#                     data = json.load(f)
#                     processed_ids.update(entry["suri_rule_id"] for entry in data)
#                 except json.JSONDecodeError:
#                     print(f"⚠️ Warning: Corrupted JSON file {filename}. Skipping.")
#     return processed_ids

# # Create LLM prompt for single classification
# def create_prompt_for_single_classification(rule, invalid_technique=None):
#     """
#     Generates a prompt for the LLM.
#     If retrying, it includes the previously invalid technique and asks the LLM to correct it.
#     """
#     correction_note = ""
#     if invalid_technique:
#         correction_note = f"""
# The previous technique suggestion was INVALID:
# - Technique ID: {invalid_technique["mitre_technique_id"]}
# - Technique Name: {invalid_technique["mitre_technique_name"]}

# Please select a VALID technique from the lates 2023 MITRE ATT&CK framework.
# """

#     return f"""
# Map the following Suricata IDS rule to a **single MITRE ATT&CK technique** with a confidence score (rounded to 2 decimal places).

# {correction_note}

# ### Rule:
# ID: {rule["suri_rule_id"]}
# File Name: {rule["file_name"]}
# Action: {rule["action"]}
# Protocol: {rule["protocol"]}
# Source: {rule["src_addr"]}:{rule["src_port"]}
# Destination: {rule["dst_addr"]}:{rule["dst_port"]}
# Options: {rule["options"]}
# Classification: {rule["suri_rule_classtype"]}
# Message: "{rule["suri_rule_msg"]}"

# Respond in valid JSON format:
# {{
#   "mitre_technique_id": "<Technique ID>",
#   "mitre_technique_name": "<Technique Name>",
#   "confidence_score": "<Confidence Score (rounded to 2 decimal places)>"
# }}
# """.strip()

# # Query the LLM
# def query_llm(prompt, use_gpt4=False):
#     model_name = "gpt-4" if use_gpt4 else "gpt-3.5-turbo"
#     system_message = (
#         "You are a cybersecurity expert. Your task is to map Suricata IDS rules "
#         "to MITRE ATT&CK techniques, ensuring the technique is valid."
#     )

#     response = client.chat.completions.create(
#         model=model_name,
#         messages=[
#             {"role": "system", "content": system_message},
#             {"role": "user", "content": prompt},
#         ],
#         max_tokens=400,
#         temperature=0.0
#     )
#     return response

# # Get nearest embedding match from Pinecone
# def get_nearest_mitre_match(technique_name, manager):
#     """
#     Queries Pinecone for the closest matching MITRE technique based on embeddings.
#     """
#     try:
#         query = technique_name
#         results = manager.retrieve_experiences(query)

#         if results:
#             top_match = results[0]
#             return {
#                 "mitre_technique_id": top_match.metadata["technique_id"],
#                 "mitre_technique_name": top_match.metadata["technique_name"],
#                 "confidence_score": "0.80"  # Assign a fixed confidence score for Pinecone match
#             }
#     except Exception as e:
#         print(f"⚠️ Error querying Pinecone: {e}")

#     return None  # Return None if no match found

# # Validate technique, retry LLM twice with invalid technique info, then fallback to Pinecone
# def validate_or_fallback(rule, mapping, manager, use_gpt4=False):
#     """
#     Step 1: Validate the first response.
#     Step 2: If invalid, re-query the LLM **twice** with the incorrect technique passed in the prompt.
#     Step 3: If still invalid, use Pinecone nearest match.
#     """
#     if mapping["mitre_technique_id"] in MITRE_TECHNIQUES:
#         return mapping  # If valid, return immediately.

#     print(f"⚠️ Invalid MITRE Technique: {mapping['mitre_technique_id']} → Re-querying LLM (Attempt 2)...")

#     # First retry with LLM
#     prompt = create_prompt_for_single_classification(rule, invalid_technique=mapping)
#     response = query_llm(prompt, use_gpt4=use_gpt4)
#     response_text = response.choices[0].message.content.strip().replace("```json", "").replace("```", "").strip()

#     try:
#         second_attempt_mapping = json.loads(response_text)

#         # Check second attempt validity
#         if second_attempt_mapping["mitre_technique_id"] in MITRE_TECHNIQUES:
#             return second_attempt_mapping  # If valid, return it.

#     except json.JSONDecodeError:
#         print(f"⚠️ LLM failed second attempt for rule {rule['suri_rule_id']}.")

#     # Final retry with LLM before using Pinecone
#     print(f"⚠️ LLM failed twice → Re-querying one last time...")
#     prompt = create_prompt_for_single_classification(rule, invalid_technique=second_attempt_mapping)
#     response = query_llm(prompt, use_gpt4=use_gpt4)
#     response_text = response.choices[0].message.content.strip().replace("```json", "").replace("```", "").strip()

#     try:
#         third_attempt_mapping = json.loads(response_text)

#         if third_attempt_mapping["mitre_technique_id"] in MITRE_TECHNIQUES:
#             return third_attempt_mapping  # If valid, return it.

#     except json.JSONDecodeError:
#         print(f"⚠️ LLM failed third attempt for rule {rule['suri_rule_id']}.")

#     # Fallback to Pinecone
#     print(f"⚠️ LLM failed three times → Using Pinecone for nearest match...")
#     return get_nearest_mitre_match(mapping["mitre_technique_name"], manager)

# if __name__ == "__main__":
#     from langchain_community.embeddings import OpenAIEmbeddings
#     from langchain_community.vectorstores import Pinecone
#     from pinecone import Pinecone as PineconeClient

#     # Initialize Pinecone Experience Manager
#     project_name = "testautoattackerproject"
#     vectordb_name = "test_autoattacker_vectordb"
#     manager = ExperienceManager(project_name, vectordb_name)

#     process_csv_and_map_to_mitre("suricata_extracted_rules_parsed.csv", manager, use_gpt4=False)



⚠️ Invalid MITRE Technique: T1073 → Re-querying LLM (Attempt 2)...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM (Attempt 2)...
⚠️ LLM failed twice → Re-querying one last time...
⚠️ LLM failed three times → Using Pinecone for nearest match...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM (Attempt 2)...
⚠️ LLM failed twice → Re-querying one last time...
⚠️ LLM failed three times → Using Pinecone for nearest match...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM (Attempt 2)...
⚠️ LLM failed twice → Re-querying one last time...
⚠️ LLM failed three times → Using Pinecone for nearest match...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM (Attempt 2)...
⚠️ LLM failed twice → Re-querying one last time...
⚠️ LLM failed three times → Using Pinecone for nearest match...
⚠️ Invalid MITRE Technique: T1043 → Re-querying LLM (Attempt 2)...
⚠️ LLM failed twice → Re-querying one last time...
⚠️ LLM failed three times → Using Pinecone for nearest match...
⚠️ Invalid MITRE Techni

KeyboardInterrupt: 

In [52]:
import json
import os
import csv
import re
import openai
from google.colab import files  # Google Colab auto-download

# OpenAI Client
client = openai
client.api_key = os.getenv("OPENAI_API_KEY")

# Constants
BATCH_SIZE = 2000
SAVE_INTERVAL = 1000
PROGRESS_UPDATE_INTERVAL = 50
START_FROM_RULE = 34001
OUTPUT_DIR = "output_batches"
TECHNIQUES_JSON_FILE = "techniques.json"
TEST_MODE = False

# Ensure output directory exists
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Load MITRE ATT&CK Techniques (all techniques)
def load_mitre_techniques():
    with open(TECHNIQUES_JSON_FILE, "r", encoding="utf-8") as f:
        techniques = json.load(f)

    # Store only technique IDs for quick lookup
    technique_dict = {t["technique_id"]: t for t in techniques}
    return technique_dict, techniques  # Returning the full list instead of chunks

MITRE_TECHNIQUES, ALL_TECHNIQUES = load_mitre_techniques()

# Read processed rule IDs
def get_processed_rule_ids():
    processed_ids = set()
    for filename in os.listdir(OUTPUT_DIR):
        if filename.endswith(".json"):
            with open(os.path.join(OUTPUT_DIR, filename), "r", encoding="utf-8") as f:
                try:
                    data = json.load(f)
                    processed_ids.update(entry["suri_rule_id"] for entry in data)
                except json.JSONDecodeError:
                    print(f"⚠️ Warning: Corrupted JSON file {filename}. Skipping.")
    return processed_ids

# Create LLM prompt with all techniques
def create_prompt_for_classification(rule, invalid_technique=None):
    """
    Generates a classification prompt with ALL MITRE techniques.
    """
    correction_note = ""
    if invalid_technique:
        correction_note = f"""
The previous technique suggestion was INVALID:
- Technique ID: {invalid_technique["mitre_technique_id"]}
- Technique Name: {invalid_technique["mitre_technique_name"]}

Please select a **valid** technique **ONLY from the list below**.
"""

    # Format all techniques for the prompt
    technique_list = "\n".join([
        f"- {t['technique_id']} ({t['technique_name']})" for t in ALL_TECHNIQUES
    ])

    return f"""
Map the following Suricata IDS rule to a **single MITRE ATT&CK technique** with a confidence score.

{correction_note}

### Rule:
ID: {rule["suri_rule_id"]}
File Name: {rule["file_name"]}
Classification: {rule["suri_rule_classtype"]}
Message: "{rule["suri_rule_msg"]}"

### **ONLY SELECT FROM THESE TECHNIQUES**:
{technique_list}

Respond in valid JSON format:
{{
  "mitre_technique_id": "<Technique ID>",
  "mitre_technique_name": "<Technique Name>",
  "confidence_score": "<Confidence Score (rounded to 2 decimal places)>"
}}
""".strip()

# Query the LLM
def query_llm(prompt, use_gpt4=False):
    model_name = "gpt-4" if use_gpt4 else "gpt-3.5-turbo"
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1000,  # Allow up to 1000 tokens for the response
        temperature=0.0
    )
    return response

# Validate, Retry, then Query Pinecone
def validate_or_fallback(rule, mapping, manager, use_gpt4=False):
    """
    Step 1: Check if technique is valid.
    Step 2: If invalid, retry LLM with ALL techniques.
    Step 3: If LLM fails twice, fallback to Pinecone.
    """
    if mapping["mitre_technique_id"] in MITRE_TECHNIQUES:
        return mapping  # If valid, return immediately.

    print(f"⚠️ Invalid MITRE Technique: {mapping['mitre_technique_id']} → Re-querying LLM with ALL techniques...")

    # Retry with ALL techniques included
    prompt = create_prompt_for_classification(rule, invalid_technique=mapping)
    response = query_llm(prompt, use_gpt4=use_gpt4)
    response_text = response.choices[0].message.content.strip().replace("```json", "").replace("```", "").strip()

    try:
        second_attempt_mapping = json.loads(response_text)
        if second_attempt_mapping["mitre_technique_id"] in MITRE_TECHNIQUES:
            return second_attempt_mapping  # If valid, return it.
    except json.JSONDecodeError:
        print(f"⚠️ LLM failed on second attempt for rule {rule['suri_rule_id']}.")

    # Fallback to Pinecone if LLM fails again
    print(f"⚠️ LLM failed twice → Using Pinecone for nearest match...")
    return get_nearest_mitre_match(mapping["mitre_technique_name"], manager)

# Pinecone Fallback Function
def get_nearest_mitre_match(technique_name, manager):
    try:
        results = manager.retrieve_experiences(technique_name)
        if results:
            top_match = results[0]
            return {
                "mitre_technique_id": top_match.metadata["technique_id"],
                "mitre_technique_name": top_match.metadata["technique_name"],
                "confidence_score": "0.80"
            }
    except Exception as e:
        print(f"⚠️ Error querying Pinecone: {e}")
    return {
        "mitre_technique_id": "T9999",
        "mitre_technique_name": "Unknown Technique",
        "confidence_score": "0.00"
    }

# Main Processing Pipeline
def process_csv_and_map_to_mitre(csv_file, manager, use_gpt4=False):
    processed_ids = get_processed_rule_ids()
    batch_results = []
    total_processed = 0
    batch_index = 1

    with open(csv_file, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader, start=1):
            if i < START_FROM_RULE:
                continue

            suri_rule_id = f"{row.get('file_name', 'unknown')}_{i}"
            if suri_rule_id in processed_ids:
                continue

            rule = {
                "suri_rule_id": suri_rule_id,
                "file_name": row.get("file_name", "N/A"),
                "action": row.get("action", "N/A"),
                "protocol": row.get("protocol", "N/A"),
                "src_addr": row.get("src_addr", "N/A"),
                "src_port": row.get("src_port", "N/A"),
                "dst_addr": row.get("dst_addr", "N/A"),
                "dst_port": row.get("dst_port", "N/A"),
                "options": row.get("options", ""),
                "suri_rule_classtype": row.get("classtype", "N/A"),
                "suri_rule_msg": row.get("msg", "N/A"),
            }

            prompt = create_prompt_for_classification(rule)
            response = query_llm(prompt, use_gpt4=use_gpt4)
            response_text = response.choices[0].message.content.strip()

            try:
                mapping = json.loads(response_text)
            except json.JSONDecodeError:
                print(f"⚠️ Invalid JSON response for rule {rule['suri_rule_id']}. Skipping.")
                continue

            corrected_mapping = validate_or_fallback(rule, mapping, manager, use_gpt4=use_gpt4)
            batch_results.append({**rule, **corrected_mapping})
            total_processed += 1

            if total_processed % PROGRESS_UPDATE_INTERVAL == 0:
                print(f"✅ Processed {total_processed} rules...")

            if total_processed % SAVE_INTERVAL == 0:
                filename = os.path.join(OUTPUT_DIR, f"partial_{batch_index}.json")
                with open(filename, "w", encoding="utf-8") as f_out:
                    json.dump(batch_results, f_out, indent=4)
                files.download(filename)
                batch_results = []
                batch_index += 1

    if batch_results:
        filename = os.path.join(OUTPUT_DIR, f"final_{batch_index}.json")
        with open(filename, "w", encoding="utf-8") as f_out:
            json.dump(batch_results, f_out, indent=4)
        files.download(filename)

if __name__ == "__main__":
    # Replace with your actual Pinecone manager
    process_csv_and_map_to_mitre("suricata_extracted_rules_parsed.csv", manager, use_gpt4=False)


✅ Processed 50 rules...
✅ Processed 100 rules...
✅ Processed 150 rules...
✅ Processed 200 rules...
✅ Processed 250 rules...
✅ Processed 300 rules...
✅ Processed 350 rules...
✅ Processed 400 rules...
✅ Processed 450 rules...
✅ Processed 500 rules...
✅ Processed 550 rules...
✅ Processed 600 rules...
✅ Processed 650 rules...
✅ Processed 700 rules...
✅ Processed 750 rules...
✅ Processed 800 rules...
✅ Processed 850 rules...
✅ Processed 900 rules...
✅ Processed 950 rules...
✅ Processed 1000 rules...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Processed 1050 rules...
✅ Processed 1100 rules...
✅ Processed 1150 rules...
✅ Processed 1200 rules...
✅ Processed 1250 rules...
✅ Processed 1300 rules...
✅ Processed 1350 rules...
✅ Processed 1400 rules...
✅ Processed 1450 rules...
✅ Processed 1500 rules...
✅ Processed 1550 rules...
✅ Processed 1600 rules...
✅ Processed 1650 rules...
✅ Processed 1700 rules...
✅ Processed 1750 rules...
✅ Processed 1800 rules...
✅ Processed 1850 rules...
✅ Processed 1900 rules...
✅ Processed 1950 rules...
✅ Processed 2000 rules...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Processed 2050 rules...
✅ Processed 2100 rules...
✅ Processed 2150 rules...
✅ Processed 2200 rules...
✅ Processed 2250 rules...
✅ Processed 2300 rules...
✅ Processed 2350 rules...
✅ Processed 2400 rules...
✅ Processed 2450 rules...
✅ Processed 2500 rules...
✅ Processed 2550 rules...
✅ Processed 2600 rules...
✅ Processed 2650 rules...
✅ Processed 2700 rules...
✅ Processed 2750 rules...
✅ Processed 2800 rules...
✅ Processed 2850 rules...
✅ Processed 2900 rules...
✅ Processed 2950 rules...
✅ Processed 3000 rules...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Processed 3050 rules...
✅ Processed 3100 rules...
✅ Processed 3150 rules...
✅ Processed 3200 rules...
✅ Processed 3250 rules...
✅ Processed 3300 rules...
✅ Processed 3350 rules...
✅ Processed 3400 rules...
✅ Processed 3450 rules...
✅ Processed 3500 rules...
✅ Processed 3550 rules...
✅ Processed 3600 rules...
✅ Processed 3650 rules...
✅ Processed 3700 rules...
✅ Processed 3750 rules...
✅ Processed 3800 rules...
✅ Processed 3850 rules...
✅ Processed 3900 rules...
✅ Processed 3950 rules...
✅ Processed 4000 rules...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Processed 4050 rules...
✅ Processed 4100 rules...
✅ Processed 4150 rules...
✅ Processed 4200 rules...
✅ Processed 4250 rules...
✅ Processed 4300 rules...
✅ Processed 4350 rules...
✅ Processed 4400 rules...
✅ Processed 4450 rules...
✅ Processed 4500 rules...
✅ Processed 4550 rules...
✅ Processed 4600 rules...
✅ Processed 4650 rules...
✅ Processed 4700 rules...
✅ Processed 4750 rules...
✅ Processed 4800 rules...
✅ Processed 4850 rules...
✅ Processed 4900 rules...
✅ Processed 4950 rules...
✅ Processed 5000 rules...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Processed 5050 rules...
✅ Processed 5100 rules...
✅ Processed 5150 rules...
✅ Processed 5200 rules...
✅ Processed 5250 rules...
✅ Processed 5300 rules...
✅ Processed 5350 rules...
✅ Processed 5400 rules...
✅ Processed 5450 rules...
✅ Processed 5500 rules...
✅ Processed 5550 rules...
✅ Processed 5600 rules...
✅ Processed 5650 rules...
✅ Processed 5700 rules...
✅ Processed 5750 rules...
✅ Processed 5800 rules...
✅ Processed 5850 rules...
✅ Processed 5900 rules...
✅ Processed 5950 rules...
✅ Processed 6000 rules...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Processed 6050 rules...
✅ Processed 6100 rules...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# import pandas as pd
# # Load JSON file
# data = pd.read_json("testDataMapped.json")
# # Save to CSV
# data.to_csv("testDataMapped.csv", index=False)

In [53]:
import json
import os

# Directory that holds your exec_*.json files
INPUT_DIR = "output_batches_final"  # or wherever your files are
# Final merged output filename
OUTPUT_FILE = "all_mapped_results.json"

def merge_json_files():
    # We'll store all results here
    merged_data = []

    # Keep track of totals
    files_processed = 0
    rules_loaded = 0

    # For convenience, list all files named exec_*.json
    # Adjust the pattern if your files are named differently.
    json_files = [f for f in os.listdir(INPUT_DIR) if f.startswith("exec_") and f.endswith(".json")]

    # Sort them in ascending order so 'exec_0.json' comes before 'exec_41.json'
    json_files.sort()

    for filename in json_files:
        file_path = os.path.join(INPUT_DIR, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
                if isinstance(data, list):
                    merged_data.extend(data)
                    print(f"✔️ Merged {filename} with {len(data)} items.")
                    rules_loaded += len(data)
                else:
                    # If not a list, decide how to handle.
                    # Possibly wrap it or just append a single object.
                    merged_data.append(data)
                    print(f"✔️ Merged {filename} with 1 item (not a list).")
                    rules_loaded += 1

                files_processed += 1

        except json.JSONDecodeError:
            print(f"⚠️ Skipping file {filename} - invalid JSON.")
        except Exception as e:
            print(f"⚠️ Error reading {filename}: {e}")

    # Write out the combined data
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        json.dump(merged_data, out, indent=4)

    # Final logging
    print(f"\n✅ Successfully created merged file: {OUTPUT_FILE}")
    print(f"   🔹 Total files processed: {files_processed}")
    print(f"   🔹 Total rules loaded: {rules_loaded}")
    print(f"   🔹 Total items in merged file: {len(merged_data)}")


if __name__ == "__main__":
    merge_json_files()

✔️ Merged exec_0.json with 1000 items.
✔️ Merged exec_1.json with 1000 items.
✔️ Merged exec_10.json with 1000 items.
✔️ Merged exec_11.json with 1000 items.
✔️ Merged exec_12.json with 1000 items.
✔️ Merged exec_13.json with 1000 items.
✔️ Merged exec_14.json with 1000 items.
✔️ Merged exec_15.json with 1000 items.
✔️ Merged exec_16.json with 1000 items.
✔️ Merged exec_17.json with 1000 items.
✔️ Merged exec_18.json with 1000 items.
✔️ Merged exec_19.json with 1000 items.
✔️ Merged exec_2.json with 1000 items.
✔️ Merged exec_20.json with 1000 items.
✔️ Merged exec_21.json with 1000 items.
✔️ Merged exec_22.json with 1000 items.
✔️ Merged exec_23.json with 1000 items.
✔️ Merged exec_24.json with 1000 items.
✔️ Merged exec_25.json with 1000 items.
✔️ Merged exec_26.json with 1000 items.
✔️ Merged exec_27.json with 1000 items.
✔️ Merged exec_28.json with 1000 items.
✔️ Merged exec_29.json with 1000 items.
✔️ Merged exec_3.json with 1000 items.
✔️ Merged exec_30.json with 1000 items.
✔️ M

previous code