In [None]:
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
from attackcti import attack_client
import pandas as pd

In [None]:
# Initialize the attack_client object
client = attack_client()

# Get all techniques
techniques = client.get_enterprise_techniques()

In [None]:
# Create a list of dictionaries containing the technique ID, name, and description
technique_list = []
for technique in techniques:
    technique_dict = {
        'ID': technique['external_references'][0]['external_id'],
        'Name': technique['name'],
        'Description': technique['description']
    }
    technique_list.append(technique_dict)

# Create a pandas DataFrame from the list of dictionaries
mitre_df = pd.DataFrame(technique_list)

print(mitre_df.columns)

In [32]:
# Function to parse the rule from a given line
def parse_rule(rule_line):
    rule_dict = {}
    # Define pattern for the outer part of the rule (e.g., "alert tcp $HOME_NET any -> $EXTERNAL_NET any")
    outer_pattern = r"(?P<action>\w+) (?P<protocol>\w+) (?P<src>\$[A-Z_]+ [a-z]*) -> (?P<dst>\$[A-Z_]+ [a-z]*)"
    # Attempt to match the outer pattern
    outer_match = re.match(outer_pattern, rule_line)
    if outer_match:
        # If outer part is matched, add it to the rule dictionary
        rule_dict.update(outer_match.groupdict())
        # Define pattern for the inner part of the rule (e.g., "(msg:"RULE_MSG"; sid:1000006; rev:001;)")
        inner_pattern = r"\((?P<inner_contents>.*)\)"
        # Attempt to match the inner pattern
        inner_match = re.search(inner_pattern, rule_line)
        if inner_match:
            # If inner part is matched, further process it
            inner_contents = inner_match.group("inner_contents").split(";")
            for content in inner_contents:
                content = content.strip()
                if ":" in content:
                    # Split each content on ":" to get key-value pairs and add them to the rule dictionary
                    key, value = content.split(":", 1)
                    rule_dict[key.strip()] = value.strip('" ')
    return rule_dict

# Define the folder where the rules are stored
folder_path = 'D:/python/snort mitre/rules'  
# Prepare a list to store all parsed rules
all_rules = []

# Loop over each file in the folder
for filename in os.listdir(folder_path):
    # Create the full file path by joining the folder path and the filename
    file_path = os.path.join(folder_path, filename)
    # Open the file in read mode
    with open(file_path, 'r', encoding='utf-8') as f:
        # Loop over each line in the file
        for line in f:
            line = line.strip()
            # Only process lines that start with 'alert' (i.e., are rule lines)
            if line.startswith('alert'):  
                # Parse the rule and add it to the list
                rule_dict = parse_rule(line)
                all_rules.append(rule_dict)

# Download the stopwords from NLTK
stop_words = set(stopwords.words('english'))
# Define additional stopwords specific to the cybersecurity context
cyber_stop_words = {'attempt', 'other', 'known', 'org','user'}
# Update the list of stopwords
stop_words.update(cyber_stop_words)

# Prepare a dictionary to count the frequency of each term
# Use a defaultdict to avoid KeyErrors and automatically initialize missing keys to 0
term_frequencies = defaultdict(int)
# Define the fields where keywords should be extracted from
keyword_fields = ['msg']

# Loop over each parsed rule
for rule in all_rules:
    # Loop over each keyword field
    for field in keyword_fields:
        # If the field is present in the rule
        if field in rule:
            # Use NLTK's word_tokenize function to tokenize the field's value
            tokens = word_tokenize(rule[field])
            # Loop over each token
            for token in tokens:
                # Ignore tokens that are either too short or are stop words
                if len(token) > 2 and token not in stop_words:
                    # Increment the term's count
                    term_frequencies[token] += 1

# Find the 20 most common terms and their frequencies
most_common_terms = sorted(term_frequencies.items(), key=lambda x: x[1], reverse=True)
# Print the most common terms and their frequencies
ordered_keywords = []
for term, freq in most_common_terms:
    ordered_keywords.append(term)


In [33]:
len(ordered_keywords)

3474

In [25]:
# Pre-processing the MITRE techniques
mitre_df['tokenized_desc'] = mitre_df['Description'].apply(word_tokenize)
mitre_df['tokenized_desc'] = mitre_df['tokenized_desc'].apply(lambda x: [word for word in x if word not in stop_words])


In [35]:
# Initial preparation
lut = []
keyword_counts = {}  # Dictionary to keep track of total keyword counts across all techniques

for keyword in ordered_keywords:
    # Find techniques that mention the keyword in their description
    relevant_techniques = mitre_df[mitre_df['tokenized_desc'].apply(lambda desc: keyword in desc)].copy()

    if not relevant_techniques.empty:
        # Count how many times the keyword appears in each description
        relevant_techniques['keyword_count'] = relevant_techniques['tokenized_desc'].apply(lambda desc: desc.count(keyword))
        # Add the total count to the keyword_counts dictionary
        keyword_counts[keyword] = relevant_techniques['keyword_count'].sum()
        # Sort techniques by how often they mention the keyword
        relevant_techniques = relevant_techniques.sort_values('keyword_count', ascending=False)

        # Prepare LUT entry
        lut_entry = {'keyword': keyword}

        # Prepare list to store notes about non-top techniques
        non_top_techniques = []

        for i, (index, row) in enumerate(relevant_techniques.iterrows()):
            technique = row['Name']
            keyword_count = row['keyword_count']

            # Compute confidence score as the ratio of keyword_count for the technique 
            # to the total count of the keyword in all descriptions, round to 2 decimal places
            confidence = round(keyword_count / keyword_counts[keyword], 2)

            if i < 2:
                # Top techniques
                lut_entry[f'tech_{i+1}'] = technique
                lut_entry[f'tech_{i+1}_confidence'] = confidence
            else:
                # Non-top techniques, store for notes
                non_top_techniques.append(f'{technique} (confidence: {confidence})')

        # Convert list of non-top techniques into a single string and add it to the LUT entry
        lut_entry['notes'] = '; '.join(non_top_techniques)

        lut.append(lut_entry)

    # Stop when the LUT has 200 entries
    if len(lut) >= 200:
        break

# Convert the LUT to a DataFrame
lut_df = pd.DataFrame(lut)
lut_df.to_csv('keyword_MITRE_LUT.csv', index=False)
lut_df.to_excel('keyword_MITRE_LUT.xlsx', index=False)
