In [None]:
!pip install langchain-experimental langchain-community langchain networkx langchain-google-genai langchain-core json-repair tiktoken

Collecting langchain-experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.16-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.0.9-py3-none-any.whl.metadata (3.6 kB)
Collecting json-repair
  Downloading json_repair-0.35.0-py3-none-any.whl.metadata (11 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langc

In [None]:
import re
import json
import os

def parse_rules_file(rules_file_path):
    """
    Parses a Suricata rules file and extracts relevant information.

    Every non-empty line that does not start with '#' is searched for the
    ``msg``, ``classtype`` and ``sid`` options. Each option is located on its
    own, so a rule is accepted regardless of the order in which the three
    options appear. Lines missing any of the three are reported with a
    warning and skipped.

    :param rules_file_path: Path to the rules file
    :return: A list of dictionaries with the keys ``suri_rule_id``,
             ``suri_rule_classtype`` and ``suri_rule_msg`` (all strings)
    """
    # One pattern per option instead of a single ordered pattern: Suricata does
    # not fix the order of options, so "sid" may legally precede "classtype".
    msg_pattern = re.compile(r'\bmsg:\s*"(?P<msg>.*?)";')
    classtype_pattern = re.compile(r'\bclasstype:\s*(?P<classtype>[^;]+);')
    sid_pattern = re.compile(r'\bsid:\s*(?P<sid>\d+)\s*;')

    extracted_data = []
    with open(rules_file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, 1):
            line = line.strip()
            # Skip comments (including commented-out rules) and empty lines
            if not line or line.startswith('#'):
                continue

            msg_match = msg_pattern.search(line)
            classtype_match = classtype_pattern.search(line)
            sid_match = sid_pattern.search(line)

            if not (msg_match and classtype_match and sid_match):
                print(f"Warning: Line {line_number} does not match expected format.")
                continue

            extracted_data.append({
                "suri_rule_id": sid_match.group('sid'),
                # strip() drops whitespace around the value, e.g. "classtype: x ;"
                "suri_rule_classtype": classtype_match.group('classtype').strip(),
                "suri_rule_msg": msg_match.group('msg')
            })
    return extracted_data

def save_to_json(data, output_file_path):
    """
    Write the extracted rule data to disk as JSON and report where it went.

    The file is (re)created as UTF-8 text and the data is serialized with a
    four-space indent.

    :param data: List of extracted rule data
    :param output_file_path: Path to the output JSON file
    """
    with open(output_file_path, mode="w", encoding="utf-8") as destination:
        json.dump(obj=data, fp=destination, indent=4)
    print(f"Extracted rules saved to {output_file_path}")

if __name__ == "__main__":
    # Where to read the Suricata rules from and where to write the JSON
    # (point rules_file_path at your own .rules file)
    rules_file_path = "testData.rules"
    output_file_path = "testData.json"

    # Extract msg / classtype / sid from the rules file
    print(f"Parsing rules file: {rules_file_path}")
    extracted_rules = parse_rules_file(rules_file_path)

    # Persist the extracted rules, then report how many there were
    save_to_json(extracted_rules, output_file_path)
    print(f"Total rules extracted: {len(extracted_rules)}")


Parsing rules file: testData.rules
Extracted rules saved to testData.json
Total rules extracted: 515


In [None]:
# Provide the OpenAI API key without writing it into the notebook.
# A key already present in the environment is kept (the previous version
# overwrote it with an empty string); otherwise it is asked for interactively
# with hidden input so it never appears in a cell or its output.
import os
from getpass import getpass

if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass("Enter your OpenAI API key: ")

In [None]:
import json
import os
import csv
import time
import re
import openai
from collections import OrderedDict

#############################################
# 1) SETUP: Client, API Key, and Cache
#############################################

# The openai module itself acts as the client handle (replace with a custom client object if needed)
client = openai
client.api_key = os.environ.get("OPENAI_API_KEY")

# Results already obtained from the model, keyed by rule message, so a repeated message costs no extra call
cache = dict()

#############################################
# 2) REGEX for extracting msg and classtype
#############################################

# Text of the msg option. A backslash-escaped character (for example \") is
# treated as part of the message rather than as its end, so messages that
# contain escaped quotes are captured in full instead of being cut short.
MSG_REGEX = re.compile(r'msg:"((?:\\.|[^"\\])+)"', re.IGNORECASE)
# Value of the classtype option: everything up to the terminating semicolon.
CLASSTYPE_REGEX = re.compile(r'classtype:([^;]+);', re.IGNORECASE)

#############################################
# 3) PROMPT CREATION
#############################################

def create_prompt_with_confidence(rule):
    """
    Build the user prompt sent to the LLM for a single Suricata rule.

    The prompt asks for three MITRE ATT&CK techniques, each with a confidence
    score between 0 and 1 rounded to two decimal places, and spells out the
    JSON layout the answer must follow. All fields of the rule are embedded
    in the text to give the model context.

    `rule` is a mapping that must provide the keys suri_rule_id, file_name,
    action, protocol, src_addr, src_port, dst_addr, dst_port, options,
    suri_rule_classtype and suri_rule_msg. The prompt is returned with
    leading and trailing whitespace removed.
    """
    # Doubled braces are literal braces of the JSON skeleton; single-brace
    # fields are filled in from `rule` by str.format below.
    prompt_template = """
Map the following Suricata IDS rule to three MITRE ATT&CK techniques,
and provide a confidence score for each technique 0-1 rounded upto 2 decimal places for each.

### Rule:
ID: {rule[suri_rule_id]}
File Name: {rule[file_name]}
Action: {rule[action]}
Protocol: {rule[protocol]}
Source: {rule[src_addr]}:{rule[src_port]}
Destination: {rule[dst_addr]}:{rule[dst_port]}
Options: {rule[options]}
Classification: {rule[suri_rule_classtype]}
Message: "{rule[suri_rule_msg]}"

Respond in valid JSON with exactly these fields:
{{
  "most_likely_technique": {{
    "mitre_technique_id": "<Technique ID>",
    "mitre_technique_name": "<Technique Name>",
    "mitre_tactic_id": "<Tactic ID>",
    "mitre_tactic_name": "<Tactic Name>",
    "confidence_score": "<Confidence Score>"
  }},
  "second_most_likely_technique": {{
    "mitre_technique_id": "<Technique ID>",
    "mitre_technique_name": "<Technique Name>",
    "mitre_tactic_id": "<Tactic ID>",
    "mitre_tactic_name": "<Tactic Name>",
    "confidence_score": "<Confidence Score>"
  }},
  "third_most_likely_technique": {{
    "mitre_technique_id": "<Technique ID>",
    "mitre_technique_name": "<Technique Name>",
    "mitre_tactic_id": "<Tactic ID>",
    "mitre_tactic_name": "<Tactic Name>",
    "confidence_score": "<Confidence Score>"
  }}
}}
"""
    filled_prompt = prompt_template.format(rule=rule)
    return filled_prompt.strip()

#############################################
# 4) HELPER: Query the LLM with client.chat
#############################################

def query_llm(prompt, use_gpt4=False):
    """
    Send one prompt to the chat model and hand back the raw response object.

    The call goes through client.chat.completions.create (not the older
    openai.ChatCompletion.create). "gpt-4" is used when `use_gpt4` is true,
    "gpt-3.5-turbo" otherwise. The request uses a fixed system message,
    a 400-token completion limit and temperature 0.0.
    """
    if use_gpt4:
        chosen_model = "gpt-4"
    else:
        chosen_model = "gpt-3.5-turbo"

    # System role first, then the caller's prompt as the user turn
    conversation = [
        {
            "role": "system",
            "content": (
                "You are a cybersecurity expert. "
                "Your task is to map Suricata IDS rules to MITRE ATT&CK techniques, "
                "providing a confidence score for each technique."
            ),
        },
        {"role": "user", "content": prompt},
    ]

    return client.chat.completions.create(
        model=chosen_model,
        messages=conversation,
        max_tokens=400,
        temperature=0.0,
    )

#############################################
# 5) MAP FUNCTION: calls query_llm
#############################################

# Message text the CSV loader assigns to rules that have no msg option.
# Such rules are unrelated to each other, so they must never share a cache entry.
_NO_MSG_PLACEHOLDER = "(no msg found)"


def map_rule_to_mitre(rule, use_gpt4=False):
    """
    Query the LLM to map a Suricata rule to MITRE techniques,
    returning the parsed JSON with confidence scores.

    Results are cached by rule message, so a message seen before is answered
    from the cache without calling the model. Rules carrying the
    "(no msg found)" placeholder are neither looked up nor stored, because the
    placeholder says nothing about the rule.

    :param rule: Rule dictionary; must contain "suri_rule_msg" and
                 "suri_rule_id" plus whatever create_prompt_with_confidence reads
    :param use_gpt4: Forwarded to query_llm to select the model
    :return: The parsed JSON object (dict) with the keys
             "most_likely_technique", "second_most_likely_technique" and
             "third_most_likely_technique", or None if the call failed or the
             reply could not be parsed/validated
    """

    rule_msg = rule["suri_rule_msg"]
    use_cache = rule_msg != _NO_MSG_PLACEHOLDER
    if use_cache and rule_msg in cache:
        print(f"[CACHE HIT] Using cached result for rule message: {rule_msg}")
        return cache[rule_msg]

    # Build the user prompt, now including all fields
    user_prompt = create_prompt_with_confidence(rule)

    try:
        start_time = time.time()
        response = query_llm(user_prompt, use_gpt4=use_gpt4)
        elapsed_time = time.time() - start_time

        response_text = response.choices[0].message.content.strip()
        print(f"[Rule ID: {rule['suri_rule_id']}] Raw response:\n{response_text}\n")

        # Clean up possible JSON code fences
        cleaned_text = (
            response_text
            .replace("```json", "")
            .replace("```", "")
            .strip()
        )

        # The model sometimes wraps the object in prose ("Here is the mapping: ...").
        # Keep only the outermost {...} span; text that is already a bare
        # object is left unchanged by this.
        first_brace = cleaned_text.find("{")
        last_brace = cleaned_text.rfind("}")
        if first_brace != -1 and last_brace > first_brace:
            cleaned_text = cleaned_text[first_brace:last_brace + 1]

        # Parse JSON
        parsed_response = json.loads(cleaned_text)

        # The caller indexes the result by key, so anything but an object is unusable
        if not isinstance(parsed_response, dict):
            raise ValueError(
                f"Expected a JSON object for rule ID {rule['suri_rule_id']}."
            )

        # Quick sanity check: we expect these top-level keys
        required_top_keys = [
            "most_likely_technique",
            "second_most_likely_technique",
            "third_most_likely_technique"
        ]
        for key in required_top_keys:
            if key not in parsed_response:
                raise ValueError(
                    f"Missing '{key}' in JSON for rule ID {rule['suri_rule_id']}."
                )

        if use_cache:
            cache[rule_msg] = parsed_response
        print(f"Rule ID {rule['suri_rule_id']} mapped in {elapsed_time:.1f}s\n")
        return parsed_response

    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON for rule ID {rule['suri_rule_id']}: {e}")
        return None
    except Exception as e:
        # Best effort: one failing rule (API error, malformed reply) must not stop the batch
        print(f"Error processing rule ID {rule['suri_rule_id']}: {e}")
        return None

#############################################
# 6) MAIN CSV PROCESSOR
#############################################

def _load_rules_from_csv(csv_input_file):
    """Read the rules CSV and return one rule dictionary per data row."""
    rules = []
    # newline="" is what the csv module expects: without it, line breaks
    # embedded in quoted fields are altered before the parser sees them.
    with open(csv_input_file, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)

        for i, row in enumerate(reader, start=1):
            # DictReader fills missing trailing columns with None; treat that as "no options"
            options = row["options"] or ""

            # Attempt to parse "msg" from the options
            msg_match = MSG_REGEX.search(options)
            suri_rule_msg = msg_match.group(1).strip() if msg_match else "(no msg found)"

            # Attempt to parse "classtype" from the options
            classtype_match = CLASSTYPE_REGEX.search(options)
            suri_rule_classtype = (
                classtype_match.group(1).strip() if classtype_match else "(no classtype found)"
            )

            # Build a dictionary with ALL relevant fields
            rules.append({
                # Synthetic ID: source file name plus 1-based row number
                "suri_rule_id": f"{row['file_name']}_{i}",
                "suri_rule_classtype": suri_rule_classtype,
                "suri_rule_msg": suri_rule_msg,
                "file_name": row["file_name"],
                "action": row["action"],
                "protocol": row["protocol"],
                "src_addr": row["src_addr"],
                "src_port": row["src_port"],
                "dst_addr": row["dst_addr"],
                "dst_port": row["dst_port"],
                "options": options
            })
    return rules


def process_csv_and_map_to_mitre(
    csv_input_file,
    json_output_file,
    test_mode=True,
    use_gpt4=False
):
    """
    1) Read Suricata rules from a CSV that has columns:
       file_name, action, protocol, src_addr, src_port, dst_addr, dst_port, options
    2) Give each row the ID "<file_name>_<row number>" and extract
       suri_rule_classtype and suri_rule_msg by parsing the 'options' field
       (placeholders are used when an option is absent).
    3) Map each rule to top-3 MITRE techniques with confidence scores.
    4) Write the results to a JSON file, including the original CSV fields.

    :param csv_input_file: Path of the CSV file to read
    :param json_output_file: Path of the JSON file to write
    :param test_mode: When true, only the first 5 rules are processed
    :param use_gpt4: Forwarded to map_rule_to_mitre to select the model
    :return: None; rules whose mapping failed are left out of the output
    """

    rules = _load_rules_from_csv(csv_input_file)
    print(f"Loaded {len(rules)} rule(s) from CSV: {csv_input_file}")

    # Limit how many we process if test_mode is True
    max_rules = 5 if test_mode else len(rules)
    subset_rules = rules[:max_rules]
    print(f"Processing {len(subset_rules)} rule(s){' (test mode)' if test_mode else ''}...\n")

    technique_keys = (
        "most_likely_technique",
        "second_most_likely_technique",
        "third_most_likely_technique",
    )

    mapped_results = []
    for idx, rule in enumerate(subset_rules, start=1):
        print(f"Processing rule {idx}/{len(subset_rules)}: {rule['suri_rule_id']}")
        mapping = map_rule_to_mitre(rule, use_gpt4=use_gpt4)
        if mapping:
            # Original CSV-derived fields first (dicts keep insertion order),
            # followed by the three technique entries from the LLM.
            result = dict(rule)
            for key in technique_keys:
                result[key] = mapping[key]
            mapped_results.append(result)
        else:
            print(f"Skipping rule ID {rule['suri_rule_id']} due to an error.\n")

    # Write output to JSON
    print(f"\nSaving {len(mapped_results)} mapped rules to '{json_output_file}'...")
    with open(json_output_file, "w", encoding="utf-8") as f_out:
        json.dump(mapped_results, f_out, indent=4)

    print(f"Done! Mapped {len(mapped_results)} of {len(subset_rules)} successfully.\n")

#############################################
# 7) ENTRY POINT
#############################################

if __name__ == "__main__":
    # Input CSV of sampled rules and output JSON for the mapped results
    csv_input_file = "suricata_extracted_rules_random_sampled.csv"
    json_output_file = "testDataMapped_with_confidence.json"

    # True limits the run to the first 5 rules; False processes every rule
    test_mode = False
    # True selects GPT-4 instead of the default model
    use_gpt4 = False

    process_csv_and_map_to_mitre(
        csv_input_file,
        json_output_file,
        test_mode=test_mode,
        use_gpt4=use_gpt4,
    )

Streaming output truncated to the last 5000 lines.
```

Rule ID emerging-dos.rules_325 mapped in 2.6s

Processing rule 326/497: emerging-dos.rules_326
[Rule ID: emerging-dos.rules_326] Raw response:
```json
{
  "most_likely_technique": {
    "mitre_technique_id": "T1498",
    "mitre_technique_name": "Network Denial of Service",
    "mitre_tactic_id": "TA0040",
    "mitre_tactic_name": "Impact",
    "confidence_score": "0.85"
  },
  "second_most_likely_technique": {
    "mitre_technique_id": "T1105",
    "mitre_technique_name": "Ingress Tool Transfer",
    "mitre_tactic_id": "TA0010",
    "mitre_tactic_name": "Execution",
    "confidence_score": "0.70"
  },
  "third_most_likely_technique": {
    "mitre_technique_id": "T1106",
    "mitre_technique_name": "Native API",
    "mitre_tactic_id": "TA0007",
    "mitre_tactic_name": "Defense Evasion",
    "confidence_score": "0.60"
  }
}
```

Rule ID emerging-dos.rules_326 mapped in 2.5s

Processing rule 327/497: emerging-dos.rules

In [None]:
# import pandas as pd
# # Load JSON file
# data = pd.read_json("testDataMapped.json")
# # Save to CSV
# data.to_csv("testDataMapped.csv", index=False)

In [None]:
import pandas as pd
# Read the mapped rules (JSON written by the mapping step) into a DataFrame
mapped_json_path = "testDataMapped_with_confidence.json"
data = pd.read_json(mapped_json_path)

# Export the same table as CSV, without the DataFrame index column
mapped_csv_path = "testDataMapped_with_confidence.csv"
data.to_csv(mapped_csv_path, index=False)