In [1]:
# Code to browse all the files in the rules folder, extract the rules from each file, and write them to the CSV file suricata_extracted_rules.
# Below is a breakdown of the individual rule files and the number of rules in each (do NOT take the rules that start with #):
#1.	Coresec Rules - 10 rules 2.	Botcc.portgrouped.rules - 0 rules 3.	Botcc.rules - 0 rules 4.	Compromised.rules - 10 rules 5.	Drop.rules - 10 rules 6.	Dshield.rules - 1 rule 7.	Emerging-activex.rules - 19 rules 8.	Emerging-adware_pup.rules - 10 rules 9.	Emerging-attack_responses.rules - 10 rules 10.	Emerging-chat.rules - 10 rules 11.	Emerging-coinminer.rules - 10 rules 12.	Emerging-current_event.rules - 10 rules 13.	Emerging-deleted.rules - 0 rules 14.	Emerging-dns.rules - 10 rules 15.	Emerging-dos.rules - 10 rules 16.	Emerging-dyn-dns.rules - 11 rules 17.	Emerging-exploit_kit.rules - 10 rules 18.	Emerging-exploit.rules - 10 rules 19.	Emerging-file_sharing.rules - 10 rules 20.	Emerging-ftp.rules - 10 rules 21.	Emerging-game.rules - 11 rules 22.	Emerging-hunting.rules - 10 rules 23.	Emerging-icmp.rules - 10 rules 24.	Emerging-imap.rules - 10 rules 25.	Emerging-inappropriate.rules - 0 rules 26.	Emerging-info.rules - 9 rules 27.	Emerging-ja3.rules - 11 rules 28.	Emerging-malware.rules - 10 rules 29.	Emerging-misc.rules - 10 rules 30.	Emerging-mobile_malware.rules - 10 rules 31.	Emerging-netbios.rules - 11 rules 32.	Emerging-p2p.rules - 10 rules 33.	Emerging-phishing.rules - 10 rules 34.	Emerging-pop3.rules - 9 rules 35.	Emerging-remote_access.rules - 11 rules 36.	Emerging-retired.rules - 10 rules 37.	Emerging-rpc.rules - 10 rules 38.	Emerging-scada.rules - 10 rules 39.	Emerging-scan.rules - 10 rules 40.	Emerging-shellcode.rules - 11 rules 41.	Emerging-smtp.rules - 10 rules 42.	Emerging-snmp.rules - 10 rules 43.	Emerging-sql.rules - 10 rules 44.	Emerging-ta_abused_services.rules - 10 rules 45.	Emerging-telnet.rules - 8 rules 46.	Emerging-tftp.rules - 12 rules 47.	Emerging-user_agents.rules - 10 rules 48.	Emerging-voip.rules - 10 rules 49.	Emerging-web_client.rules - 11 rules 50.	Emerging-web_server.rules - 10 rules 51.	Emerging-web_specific_apps.rules - 10 rules 52.	Emerging-worm.rules - 9 rules 53.	Threatview_CS_c2.rules - 10 rules 54.	Tor.rules - 10 rules

In [3]:
import os
import csv
import re

# Pattern for one complete Suricata rule line of the form
#     <action> <proto> <src_addr> <src_port> -> <dst_addr> <dst_port> (<options>)
# Named groups: action, proto, src_addr, src_port, dst_addr, dst_port, options.
# Every header field is matched with \S+, so an address or port may be a plain
# value, a variable, or a bracketed list -- as long as it contains no whitespace.
# Only the "->" direction operator is recognised. "options" is the text between
# the opening parenthesis after the destination port and the closing
# parenthesis that ends the line.
_RULE_PATTERN_PIECES = (
    r'^(?P<action>\S+)\s+',        # action, e.g. alert / drop / pass
    r'(?P<proto>\S+)\s+',          # protocol, e.g. ip / tcp / udp
    r'(?P<src_addr>\S+)\s+',       # source address
    r'(?P<src_port>\S+)\s+->\s+',  # source port followed by the arrow
    r'(?P<dst_addr>\S+)\s+',       # destination address
    r'(?P<dst_port>\S+)\s*',       # destination port
    r'\((?P<options>.*)\)$',       # rule options inside the parentheses
)
rule_pattern = re.compile(''.join(_RULE_PATTERN_PIECES))

def extract_suricata_rules_parsed(rules_folder='rules', output_csv='suricata_extracted_rules_parsed.csv'):
    """
    Extract the active Suricata rules from every ``.rules`` file in a folder
    and write them, split into columns, to a CSV file.

    Args:
        rules_folder: Directory that is listed (non-recursively) for files
            whose name ends with ``.rules``.
        output_csv: Path of the CSV file to create. It is overwritten if it
            already exists.

    Output columns: file_name, action, protocol, src_addr, src_port,
    dst_addr, dst_port, options.

    Per input line:
      * blank lines and lines starting with ``#`` are skipped;
      * lines starting with ``$id:`` (case-insensitive) are skipped;
      * a line matching the module-level ``rule_pattern`` is written with its
        parsed fields;
      * a line that does not match and contains ``version`` or ``generated``
        (case-insensitive) is treated as stray header metadata and skipped;
      * any other unmatched line is written with empty parsed fields and the
        whole line in the ``options`` column.

    Raises:
        OSError: if ``rules_folder`` cannot be listed or a file cannot be
            opened.
    """
    header = [
        "file_name",
        "action",
        "protocol",
        "src_addr",
        "src_port",
        "dst_addr",
        "dst_port",
        "options",      # full text in parentheses
    ]

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)

        # os.listdir() returns names in arbitrary order; sorting keeps the CSV
        # row order stable between runs.
        for filename in sorted(os.listdir(rules_folder)):
            if not filename.endswith(".rules"):
                continue
            file_path = os.path.join(rules_folder, filename)
            # A directory that happens to be named "*.rules" cannot be opened
            # as a text file; skip it instead of crashing.
            if not os.path.isfile(file_path):
                continue

            with open(file_path, 'r', encoding='utf-8', errors='ignore') as rule_file:
                for line in rule_file:
                    line = line.strip()
                    if (not line) or line.startswith('#'):
                        continue

                    lower_line = line.lower()
                    if lower_line.startswith('$id:'):
                        continue

                    match = rule_pattern.match(line)
                    if match:
                        # A line that parses as a rule is always kept, even
                        # when its text contains words such as "version":
                        # those words occur in ordinary rule messages and
                        # content matches.
                        writer.writerow([
                            filename,
                            match.group('action'),
                            match.group('proto'),
                            match.group('src_addr'),
                            match.group('src_port'),
                            match.group('dst_addr'),
                            match.group('dst_port'),
                            match.group('options'),
                        ])
                    elif 'version' in lower_line or 'generated' in lower_line:
                        # Unparseable line that looks like file-header
                        # metadata: drop it.
                        continue
                    else:
                        # Unparseable but possibly meaningful: keep the raw
                        # line in the "options" column with blank parsed
                        # fields.
                        writer.writerow([filename, "", "", "", "", "", "", line])

# Entry point: run the extraction with the function's default folder and
# output file, spelled out so the paths are visible at the call site.
if __name__ == "__main__":
    extract_suricata_rules_parsed(
        rules_folder='rules',
        output_csv='suricata_extracted_rules_parsed.csv',
    )


In [None]:
import csv
import random
from collections import defaultdict

def random_sample_from_csv(
    input_csv='suricata_extracted_rules_parsed.csv',
    output_csv='suricata_extracted_rules_random_sampled.csv',
    max_rules_per_file=10,
    seed=None
):
    """
    Write a random sample of at most ``max_rules_per_file`` rows for each
    distinct ``file_name`` value of ``input_csv`` to ``output_csv``.

    Args:
        input_csv: CSV file with a header row that includes a ``file_name``
            column.
        output_csv: Path of the CSV to create (overwritten if present). It
            gets the same header as the input.
        max_rules_per_file: Upper bound on the rows kept per ``file_name``.
            Groups with fewer rows are kept in full.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the same input yields the same sample. When ``None``, the
            module-level ``random`` generator is used.

    If ``input_csv`` has no header row at all (an empty file), an empty
    ``output_csv`` is written.

    Raises:
        KeyError: if a data row has no ``file_name`` column.
        ValueError: from ``sample`` if ``max_rules_per_file`` is negative.
    """
    data_by_filename = defaultdict(list)

    # newline='' lets the csv module do its own newline handling, as its
    # documentation recommends for files passed to a reader.
    with open(input_csv, 'r', newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        # Read the header while the file is still open: DictReader loads it
        # lazily, and for an empty file a later access would try to read from
        # the already-closed handle.
        fieldnames = reader.fieldnames
        for row in reader:
            data_by_filename[row['file_name']].append(row)

    # Keep using the global generator unless a seed was requested, so callers
    # that seed the `random` module themselves still get what they expect.
    rng = random if seed is None else random.Random(seed)

    with open(output_csv, 'w', newline='', encoding='utf-8') as f_out:
        if not fieldnames:
            # No header means no rows; leave the freshly truncated file empty.
            return

        writer = csv.DictWriter(f_out, fieldnames=fieldnames)
        writer.writeheader()

        for rows in data_by_filename.values():
            # sample() rejects k > len(population), so cap k at the group size.
            sample_count = min(len(rows), max_rules_per_file)
            writer.writerows(rng.sample(rows, sample_count))

# Entry point: sample up to 10 rows per rule file from the parsed CSV.
# Arguments are positional: input CSV, output CSV, per-file row cap.
if __name__ == "__main__":
    random_sample_from_csv(
        'suricata_extracted_rules_parsed.csv',
        'suricata_extracted_rules_random_sampled.csv',
        10,
    )