In [None]:
#__author__ = "Zafar Hussain (University of Helsinki, IVVES Project)"

import re
from typing import List
import numpy as np
import string
import json
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import numpy as np
import random
import nltk
import ntpath
from collections import defaultdict
from uuid import UUID
from functools import reduce
import os
from random import randrange
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from nltk.probability import FreqDist
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import colorcet
import matplotlib.colors
import matplotlib.cm
import bokeh.plotting as bpl
import bokeh.transform as btr
from sklearn.manifold import TSNE
import seaborn as sns
from sklearn import metrics
import colorcet as cc
from sklearn.preprocessing import StandardScaler
import random
from IPy import IP
import socket
from sklearn.decomposition import PCA
from gensim.models import Word2Vec
from sklearn.cluster import KMeans 
from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score, calinski_harabasz_score


In [None]:
# --- Patterns used by CmdlTokenizer in "manual" mode -------------------------

# Base splitter (used with findall): each match is either a whitespace-free run
# that embeds a double- or single-quoted section, a standalone quoted string,
# or a run of characters containing no quotes and no whitespace.
WHITESPACE_QUOTES_REGEX = re.compile(r"""([^\s]*?\".*?\"[^\s]*?|[^\s]*?\'.*?\'[^\s]*?|\'.*?\'|\".*?\"|[^\"\'\s]+)""")
# Token that contains a ';' and no quote characters at all.
SEMICOLON_REGEX = re.compile(r"""^[^\"']*;[^\"']*$""")

# Flag of the form -name=value / --name=value / /name=value; the value may be
# empty but must not itself start with '='.
FLAG_SEP_EQUALS_REGEX = re.compile(r"""^[-\/]+[^\=\"\']+\=(([^=]{1}.*)|())$""")
# Flag of the form -name:value or /name:value with no '=' after the colon.
FLAG_SEP_COLON_REGEX = re.compile(r"""^[\/\-][^\:\"\'\=]+\:[^=]*$""")

# Redirection operators inside an unquoted token. Each pattern has three
# alternatives: operator in the middle, at the start, or at the end of the token.
REDIRECTION_LEFT_REGEX = re.compile(r"""(^[^'"<>]+<[^'"<>]+$)|(^<[^<>]+$)|(^[^<]+<$)""")
DOUBLE_REDIRECTION_LEFT_REGEX = re.compile(r"""(^[^'"<>]+<<[^'"<>]+$)|(^<<[^<>]+$)|(^[^<]+<<$)""")
REDIRECTION_RIGHT_REGEX = re.compile(r"""(^[^'"<>]+>[^'"<>]+$)|(^>[^>]+$)|(^[^<>]+>$)""")
DOUBLE_REDIRECTION_RIGHT_REGEX = re.compile(r"""(^[^'"<>]+>>[^'"<>]+$)|(^>>[^>]+$)|(^[^<>]+>>$)""")

# Pipe operators, same three-alternative layout as the redirections above.
PIPE_REGEX = re.compile(r"""(^[^'"|]+\|[^'"|]+$)|(^\|[^\|]+$)|(^[^\|]+\|$)""")
DOUBLE_PIPE_REGEX = re.compile(r"""(^[^'"\|]+\|\|[^'"\|]+$)|(^\|\|[^\|]+$)|(^[^\|]+\|\|$)""")

# Split pattern applied by CmdlTokenizer._base_split to cmd.exe command lines
# that contain '""': it matches a double quote subject to the lookahead below.
# NOTE(review): the exact quoting rule the lookahead encodes is not documented
# here -- confirm against sample cmd.exe inputs before changing it.
test_re = re.compile(r'''"(?=(?:[^"]|'[^"]'|"[^"^"]*)*$)''')

# --- Lexers used by CmdlTokenizer in "custom" mode ----------------------------
# Both patterns expose seven capture groups, unpacked by _custom_cmdline_split
# as (qs, qss, esc, pipe, word, white, fail): double-quoted text, single-quoted
# text (always empty for Windows), escape sequence, pipe/redirection operator,
# plain word, whitespace, and a catch-all character that signals a lexing failure.
RE_CMD_LEX_LINUX = re.compile(
    r""""((?:\\["\\]|[^"])*)"|'([^']*)'|(\\.)|(&&?|\|\|?|\d?\>|[<])|([^\s'"\\&|<>]+)|(\s+)|(.)"""
)
RE_CMD_LEX_WINDOWS = re.compile(
    r""""((?:""|\\["\\]|[^"])*)"?()|(\\\\(?=\\*")|\\")|(&&?|\|\|?|\d?>|[<])|([^\s"&|<>]+)|(\s+)|(.)"""
)

# Placeholder token emitted by custom mode in place of an empty argument.
EMPTY_STRING = "<empty_string>"


class CmdlTokenizer:
    """Tokenizer that splits command lines into 'words' corresponding to their execution arguments.

    Current implementation includes Windows and Linux OS tokenization logic in two "modes":
    manual (basic, hard-coded) and custom (trying to resemble cmd.exe arg-parsing).

    This class' main method is the 'tokenize' function, which splits a single command line
    into a list of tokens as strings.

    Attributes:
        mode: identifies which logic is used to extract tokens, one of 'manual' or 'custom'
        os_name: operating system which the command lines come from, one of 'windows' or 'linux'
    """

    def __init__(self, mode="manual", os_name="windows"):
        """Validate and store the tokenization mode and the (lower-cased) OS name.

        Raises:
            AttributeError: if `mode` or `os_name` is not one of the supported values.
        """
        if mode not in ["manual", "custom"]:
            raise AttributeError(f"Wrong mode '{mode}'. Possible values: 'manual', 'custom'")
        self.mode = mode
        if os_name.lower() not in ["windows", "linux"]:
            raise AttributeError("Wrong os_name. Possible values: 'windows', 'linux'")
        self.os_name = os_name.lower()

    def get_metadata(self):
        """Return the tokenizer configuration as a dict.

        In custom mode the dict also carries the placeholder used for empty arguments.
        """
        metadata = {"mode": self.mode, "os_name": self.os_name}
        if self.mode == "custom":
            metadata["empty_string_tag"] = EMPTY_STRING
        return metadata

    def _get_splitters(self, token):
        """Check if any splits apply to the input token and return a list of splitters if applicable.
        This function's output should be passed to the _get_split_regex method of this class.

        Splitters refer to:
        - flags (options) with values, e.g. --python=python3.6 (equals or colon signs...)
        - pipes and redirections

        Splits logic may vary depending on the os_name and mode.

        Args:
            token (str): single token candidate to splitting into smaller pieces

        Returns:
            tuple (List[str], List[str]): one_char_splitters list followed by multichar_splitters list

        """
        one_char_splitters = []
        multichar_splitters = []

        if (self.os_name == "linux" or (self.os_name == "windows" and self.mode == "manual")) and SEMICOLON_REGEX.match(
            token
        ):
            one_char_splitters.append(";")
        if FLAG_SEP_EQUALS_REGEX.match(token):
            one_char_splitters.append("=")
        if FLAG_SEP_COLON_REGEX.match(token):
            one_char_splitters.append(r"\:")

        # For each operator family the doubled form takes precedence over the single one.
        if DOUBLE_REDIRECTION_LEFT_REGEX.match(token):
            multichar_splitters.append("<<")
        elif REDIRECTION_LEFT_REGEX.match(token):
            one_char_splitters.append("<")

        if DOUBLE_REDIRECTION_RIGHT_REGEX.match(token):
            multichar_splitters.append(">>")
        elif REDIRECTION_RIGHT_REGEX.match(token):
            one_char_splitters.append(">")

        if DOUBLE_PIPE_REGEX.match(token):
            multichar_splitters.append(r"\|\|")
        elif PIPE_REGEX.match(token):
            one_char_splitters.append(r"\|")
        return one_char_splitters, multichar_splitters

    def _get_split_regex(self, one_char_splitters, multichar_splitters):
        """Build one capturing-group regex (as a string) from the given splitters.

        Single-character splitters are folded into a character class; multi-character
        splitters become alternatives. Returns None when there is nothing to split on.
        The capturing group makes re.split keep the separators in its output.
        """
        splitters = []
        if one_char_splitters:
            splitters.append(f"[{''.join(one_char_splitters)}]")
        if multichar_splitters:
            splitters.extend(multichar_splitters)
        if len(splitters) >= 1:
            return "(" + "|".join(splitters) + ")"
        else:
            return None

    def _base_split(self, sentence, prog):
        """Perform the first, coarse split of a command line.

        Manual mode uses `test_re` for cmd.exe lines containing '""' and
        `WHITESPACE_QUOTES_REGEX` otherwise; custom mode delegates to
        `_custom_cmdline_split`.
        """
        if self.mode == "manual" and prog=='cmd.exe' and '""' in sentence:
            return test_re.split(sentence)
        elif self.mode == "manual":
            return WHITESPACE_QUOTES_REGEX.findall(sentence)
        elif self.mode == "custom":
            return self._custom_cmdline_split(sentence)
        return []

    def _custom_cmdline_split(self, s):
        """Multi-platform variant of shlex.split() for command-line splitting.
        For use with subprocess, for argv injection etc. Using fast REGEX.

        Raises:
            ValueError: if the lexer hits a character it cannot classify.
        """
        args = []
        accu = None  # collects pieces of one arg
        if self.os_name == "linux":
            re_cmd_lex = RE_CMD_LEX_LINUX
        elif self.os_name == "windows":
            re_cmd_lex = RE_CMD_LEX_WINDOWS
        else:
            return args

        is_quote = False
        for qs, qss, esc, pipe, word, white, fail in re_cmd_lex.findall(s):
            if word:
                pass  # most frequent
            elif esc:
                word = esc[1]
            elif white or pipe:
                # Whitespace or an operator terminates the argument being accumulated.
                if accu is not None:
                    args.append(accu)
                if pipe:
                    args.append(pipe)
                is_quote = False
                accu = None
                continue
            elif fail:
                raise ValueError("invalid or incomplete shell string")
            elif qs:
                word = qs.replace('\\"', '"').replace("\\\\", "\\")
                if self.os_name == "windows":
                    word = word.replace('""', '"')
                is_quote = True
            elif qss:
                word = qss  # may be even empty; must be last
            else:
                # Every group is empty: an empty quoted string. It is tagged with
                # the placeholder unless it directly follows a quoted piece.
                if not is_quote:
                    word = EMPTY_STRING
            accu = (accu or "") + word
        if accu is not None:
            args.append(accu)
        return args

    def _merge_tokens(self, tokens : List[str]) -> List[str]:
        """Merge some tokens which are more useful glued together in a single token.

        Only the first occurrence of each pattern is merged.

        Args:
            tokens (List[str]): list of tokens to be reviewed and merged if applicable

        Returns:
            List[str]: list of tokens after applying merge operations
        """

        def _merge_ampersand(tokens: List[str]) -> List[str]:
            """Glue '&' to the token that follows it when it comes right after '>' (as in 2>&1)."""
            # The '&' must itself be followed by a token, hence the len - 2 bound.
            for i in range(len(tokens) - 2):
                if tokens[i] == ">" and tokens[i + 1] == "&":
                    amp = i + 1
                    return tokens[:amp] + [tokens[amp] + tokens[amp + 1]] + tokens[amp + 2:]
            return tokens

        def _merge_redirections(tokens:List[str]) -> List[str]:
            """Glue two consecutive '>' tokens into a single '>>' token."""
            for i in range(len(tokens) - 1):
                if tokens[i] == ">" and tokens[i + 1] == ">":
                    return tokens[:i] + [tokens[i] + tokens[i + 1]] + tokens[i + 2:]
            return tokens

        tokens = _merge_ampersand(tokens)
        tokens = _merge_redirections(tokens)
        return tokens

    def tokenize(self, sentence: str, prog: str = "") -> List[str]:
        """Split command line into tokens.

        Args:
            sentence (str): single command line input as string
            prog (str): program name the command line belongs to; in manual mode
                the value 'cmd.exe' selects a dedicated split for lines containing '""'.
                Defaults to an empty string (no program-specific handling).

        Returns:
            List[str]: list of tokens as strings
        """
        raw_tokens: List[str] = []
        for token in self._base_split(sentence, prog):
            one_char_splitters, multichar_splitters = self._get_splitters(token)
            split_regex = self._get_split_regex(one_char_splitters, multichar_splitters)
            if split_regex:
                token = re.split(
                    split_regex,
                    token,
                    maxsplit=len(one_char_splitters) + len(multichar_splitters),
                )
                # re.split yields empty strings around leading/trailing separators.
                raw_tokens.extend(filter(None, token))
            else:
                raw_tokens.append(token)
        raw_tokens = self._merge_tokens(raw_tokens)
        return raw_tokens

    def tokenize_into_idxs(self, sentence: str, prog: str = "") -> List[tuple]:
        """Tokenize `sentence` and return (start, end) character offsets of each token.

        Tokens are located left to right with str.find. If a token cannot be found
        (tokenization altered its text), one final span covering the rest of the
        sentence is emitted and the scan stops.

        Args:
            sentence (str): single command line input as string
            prog (str): program name, forwarded to `tokenize`

        Returns:
            List[tuple]: list of (start, end) offsets into `sentence`
        """
        tokens = self.tokenize(sentence, prog)

        idxs = []
        pos = 0
        for token in tokens:
            # The placeholder stands for a literal "" in the original text.
            token = token.replace(EMPTY_STRING, '""')
            istart = sentence.find(token, pos)
            if istart < 0:
                idxs.append((pos, len(sentence)))
                pos = len(sentence)
                break
            else:
                pos = istart+len(token)
                idxs.append((istart, pos))
        return idxs

In [None]:
# train a second-order (tier-2) Markov model
def add2dict(dictionary, key, value):
    """Append `value` to the list stored under `key`, creating the list on first use."""
    dictionary.setdefault(key, []).append(value)

def list2probabilitydict(given_list):
    """Map each distinct item of `given_list` to its relative frequency.

    Keys appear in order of first occurrence; an empty list yields an empty dict.
    """
    size = len(given_list)
    return {item: count / size for item, count in Counter(given_list).items()}


def train_markov_model(new_lines, program_name):
    """Fit a second-order Markov model over the tokens of the given command lines.

    Each line is tokenized with the module-level `tok_class`, then normalised:
    surrounding whitespace and all double quotes removed, a space inserted before
    every backslash (so path components become separate tokens), commas removed,
    lower-cased and finally split on whitespace.

    Quotes are removed *before* the whitespace split, the same order the
    detection-side helpers use, so a quote-only fragment can never end up as an
    empty-string token in the model.

    Args:
        new_lines: iterable of command-line strings.
        program_name: program name with or without extension; it is normalised
            to '<stem>.exe' before being handed to the tokenizer.

    Returns:
        tuple: (initial_word, second_word, transitions) where
            initial_word maps first token -> probability,
            second_word maps first token -> {second token -> probability},
            transitions maps (token[i-2], token[i-1]) -> {token[i] -> probability};
            the pair formed by the last two tokens of a line additionally maps to '<EOP>'.
    """
    initial_word = {}
    second_word = {}
    transitions = {}
    program_name = os.path.splitext(program_name)[0]+'.exe'
    for line in new_lines:
        tokens = tok_class.tokenize(line, program_name)
        tokens = [x.strip() for x in tokens]
        tokens = [tok.replace('"', '') for tok in tokens]
        tokens = [x.replace('\\', ' \\') for x in tokens if x]
        tokens = [tok.replace(',', '').lower() for tok in tokens]
        tokens = [piece for tok in tokens for piece in tok.split()]
        tokens = generate_N_grams(tokens, 1)
        tokens_length = len(tokens)
        for i, token in enumerate(tokens):
            if i == 0:
                initial_word[token] = initial_word.get(token, 0) + 1
                continue
            prev_token = tokens[i - 1]
            if i == tokens_length - 1:
                # Last token of the line: record the end-of-line marker after it.
                add2dict(transitions, (prev_token, token), '<EOP>')
            if i == 1:
                add2dict(second_word, prev_token, token)
            else:
                add2dict(transitions, (tokens[i - 2], prev_token), token)

    # Normalize the distributions
    initial_word_total = sum(initial_word.values())
    for key, value in initial_word.items():
        initial_word[key] = value / initial_word_total

    for prev_word, next_word_list in second_word.items():
        second_word[prev_word] = list2probabilitydict(next_word_list)

    for word_pair, next_word_list in transitions.items():
        transitions[word_pair] = list2probabilitydict(next_word_list)

    return initial_word, second_word, transitions

In [None]:
# detect and mask UUID
def mask_uuid(token):
    """Replace UUIDs found in `token` with a placeholder.

    Three shapes are handled, in this order:
    - token contains both '{' and '}': every '{...}' group whose content is a
      canonically formatted UUID (8-4-4-4-12 hex, any case) is replaced by 'UUID';
      other brace groups are left untouched.
    - token contains '.': if the part before the first dot parses as a UUID the
      token becomes 'RANDOM_TOKEN.<everything after the first dot>'.
    - otherwise: a token that parses as a UUID becomes 'RANDOM_TOKEN'.

    Args:
        token (str): a single token.

    Returns:
        str: the masked token, or the token unchanged when no UUID is found.
    """

    def _parses_as_uuid(text):
        try:
            UUID(text)
        except ValueError:
            return False
        return True

    def _mask_braced(match):
        inner = match.group(1).lower()
        try:
            canonical = inner == str(UUID(inner))
        except ValueError:
            canonical = False
        # Keep the original text for brace groups that are not UUIDs.
        return 'UUID' if canonical else match.group(0)

    if '{' in token and '}' in token:
        return re.sub(r"\{([^{}]+)\}", _mask_braced, token)
    if '.' in token:
        stem, _, rest = token.partition('.')
        if _parses_as_uuid(stem):
            # Keep every extension, not just the first dot-separated part.
            return 'RANDOM_TOKEN.' + rest
        return token
    if _parses_as_uuid(token):
        return 'RANDOM_TOKEN'
    return token

In [None]:
#detect and mask numeric values
def mask_files_int_floats(tokens):
    """Mask volatile values (temp files, numbers, timestamps, hashes, IPs...) in place.

    Every token is first passed through `mask_uuid`; the first matching rule of
    the chain below (evaluated on the *original* token text) may then overwrite
    that result or mask the token(s) that follow a known flag.

    Rules that look at neighbouring tokens are bounds-checked: a flag at the
    end of the list is simply left without a masked value, and the '-event'
    rule never wraps around to the last token when examining the first one.

    Args:
        tokens (List[str]): tokens of one command line; modified in place.

    Returns:
        List[str]: the same list object, after masking.
    """
    count = len(tokens)
    for t in range(count):
        token = tokens[t]
        tokens[t] = mask_uuid(token)
        has_next = t + 1 < count
        if token.startswith('\\') and token.endswith('.tmp'):
            tokens[t] = '\\Random_Token'
        elif token.endswith('.tmp'):
            tokens[t] = 'Random_Token'
        elif token.startswith('tmp') and len(token.split('.'))==2:
            tokens[t] = 'Random_Token'
        elif token.startswith('\\tmp') and len(token.split('.'))==2:
            tokens[t] = '\\Random_Token'
        elif token.split('.')[-1]=='cmdline':
            tokens[t] = '\\Random_Token'
        elif token.isdigit():
            tokens[t] = 'Random_Token'
        elif token.replace('.','',1).isdigit() and token.count('.') < 2:
            # Decimal number with a single dot.
            tokens[t]='Random_Token'
        elif validate_iso8601(token):
            tokens[t] = 'Random_Token'
        elif contains_letters_in_order(token, 'RDS'):
            tokens[t] = mask_rds(token)
        elif token.startswith(('\\rds', 'rds', '\\RDS', 'RDS')):
            tokens[t] = mask_rds(token)
        elif t > 0 and re.search("^[a-zA-Z0-9_]*$", token) and tokens[t-1]=='-event':
            # Value following an '-event' flag.
            tokens[t] = 'Random_Token'
        elif (token.startswith('\\upi') and t + 2 < count
              and tokens[t+1][2:].isalnum() and tokens[t+2][2:].isalnum()):
            tokens[t+1]='\\Random_Token'
            tokens[t+2]='\\Random_Token'
        elif token== '-sha256' and has_next and tokens[t+1].isalnum() and len(tokens[t+1])==64:
            tokens[t+1]= 'Random_Token'
        elif token== '-token' and has_next and tokens[t+1].isalnum() and len(tokens[t+1])==40:
            tokens[t+1]= 'Random_Token'
        elif token =='-ip' and has_next:
            try:
                socket.inet_aton(tokens[t+1])
                tokens[t+1]='Random_Token'
            except socket.error:
                # Not an IPv4 address: leave the value as it is.
                pass
    return tokens
        

In [None]:
# detect and mask RDS and date-time tokens
def contains_letters_in_order(word, letters):
    """Tell whether the characters of `letters` occur in `word` in the given order.

    Arbitrary characters (except newlines) may sit between consecutive letters.
    """
    escaped = [re.escape(ch) for ch in letters]
    pattern = '.*'.join(escaped)
    found = re.search(pattern, word)
    return found is not None

def mask_rds(token):
    """Mask RDS-style identifiers inside a token.

    - Token starting with a backslash: it is split on backslashes, and every
      component that starts with 'RDS' or 'rds' and whose stem (text before the
      first dot) is exactly 15 characters long gets that stem replaced by
      'Random_Token'. The components are then re-joined with backslashes.
    - Token starting with lowercase 'rds': the whole token becomes 'Random_Token'.
    - Anything else is returned unchanged.

    The former branch for tokens starting with a backslash followed by
    'rds'/'RDS' could never run (the first case already captures every token
    that starts with a backslash) and contained `len(token==15)`, which would
    raise TypeError; it has been removed.

    Args:
        token (str): a single token.

    Returns:
        str: the masked token.
    """
    if token.startswith('\\'):
        parts = token.split('\\')
        for idx, part in enumerate(parts):
            if part.startswith(('RDS', 'rds')):
                pieces = part.split('.')
                if len(pieces[0]) == 15:
                    pieces[0] = 'Random_Token'
                    parts[idx] = '.'.join(pieces)
        return '\\'.join(parts)
    if token.startswith('rds'):
        return 'Random_Token'
    return token

# ISO 8601 date-time, e.g. 2020-01-31T23:59:59.123+02:00. Two variants are kept
# so that either the upper-case (T / Z) or the lower-case (t / z) designators
# are accepted, but not a mix of both. Compiled once instead of on every call.
_ISO8601_UPPER_REGEX = re.compile(
    r'^(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\.[0-9]+)?(Z|[+-](?:2[0-3]|[01][0-9]):[0-5][0-9])?$'
)
_ISO8601_LOWER_REGEX = re.compile(
    r'^(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])t(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\.[0-9]+)?(z|[+-](?:2[0-3]|[01][0-9]):[0-5][0-9])?$'
)


def validate_iso8601(str_val):
    """Return True when `str_val` is an ISO 8601 date-time string, else False.

    Non-string input (for which the regex engine raises TypeError) yields False
    rather than an exception.
    """
    try:
        return bool(_ISO8601_UPPER_REGEX.match(str_val) or _ISO8601_LOWER_REGEX.match(str_val))
    except TypeError:
        return False

In [None]:
def generate_N_grams(text, ngram=1):
    """Return every run of `ngram` consecutive items of `text`, joined with ', '.

    With ngram=1 the items are returned as they are; a non-positive `ngram` or a
    sequence shorter than `ngram` gives an empty list.
    """
    if ngram < 1:
        return []
    window_count = len(text) - ngram + 1
    grams = []
    for start in range(window_count):
        grams.append(', '.join(text[start:start + ngram]))
    return grams
def detect_nested_commands(tokens, initial_word, second_word, transitions):
    """Score every token of a command with the probabilities of a second-order Markov model.

    The first token is looked up in `initial_word`, the second in
    `second_word[first_token]`, and every later token in
    `transitions[(token[i-2], token[i-1])]`. A token or context the model has
    never seen scores 0 instead of raising, and scoring continues with the next
    token, so the returned score list always has one entry per token.

    Args:
        tokens (List[str]): normalised tokens of one command line.
        initial_word (dict): first token -> probability.
        second_word (dict): first token -> {second token -> probability}.
        transitions (dict): (token, token) -> {next token -> probability}.

    Returns:
        tuple: (tokens, scores) with len(scores) == len(tokens).
    """
    scores_index = []
    for i, token in enumerate(tokens):
        if i == 0:
            scores_index.append(initial_word.get(token, 0))
        elif i == 1:
            scores_index.append(second_word.get(tokens[0], {}).get(token, 0))
        else:
            # Context is always the two preceding tokens of the command itself.
            next_dict = transitions.get((tokens[i - 2], tokens[i - 1]), {})
            scores_index.append(next_dict.get(token, 0))
    return tokens, scores_index


In [None]:
# detect random directories
def detect_random_dirs(commands, initial_word, second_word, transitions, random_number_prob, number_dir_prob, alnum_dir_prob, prog_name):
    """Replace improbable numeric / directory tokens of each command with a placeholder.

    Every command is tokenized with the module-level `tok_class`, normalised the
    same way as for model training, and scored with `detect_nested_commands`.
    A scored token is masked when:
    - it is all digits and its probability is below `random_number_prob`;
    - it is a backslash followed by digits and below `number_dir_prob`;
    - it is a backslash followed by alphanumerics and below `alnum_dir_prob`.
    Tokens for which no score is available are left unchanged.

    Args:
        commands (List[str]): command lines to process.
        initial_word, second_word, transitions: model from `train_markov_model`.
        random_number_prob, number_dir_prob, alnum_dir_prob (float): thresholds.
        prog_name (str): program name with or without extension; normalised to
            '<stem>.exe' exactly as `train_markov_model` does, so training and
            detection tokenize identically even if '.exe' was already appended.

    Returns:
        List[List[str]]: one single-element list per command holding the
        re-joined, masked command line.
    """
    program = os.path.splitext(prog_name)[0] + '.exe'
    random_tokens = []
    for command in commands:
        tokens = tok_class.tokenize(command, program)
        tokens = [x.strip() for x in tokens]
        tokens = [tok.replace('"', '') for tok in tokens]
        tokens = [x.replace('\\', ' \\') for x in tokens if x]
        tokens = [tok.replace(',', '').lower() for tok in tokens]
        tokens = [piece for tok in tokens for piece in tok.split()]
        tokens = generate_N_grams(tokens, 1)
        tokens = [tok.replace('"', '') for tok in tokens]
        tokens, prob = detect_nested_commands(tokens, initial_word, second_word, transitions)
        # zip stops at the shorter sequence, so a short score list cannot raise.
        for i, (token, score) in enumerate(zip(list(tokens), prob)):
            if token.isdigit() and score < random_number_prob:
                tokens[i] = 'Random_Token'
            elif token.startswith('\\') and token[1:].isdigit() and score < number_dir_prob:
                tokens[i] = '\\Random_Token'
            elif token.startswith('\\') and token[1:].isalnum() and score < alnum_dir_prob:
                tokens[i] = '\\Random_Token'
        joined = ' '.join(tokens)
        random_tokens.append([joined.replace('monitoring host temporary files randomnumber', 'TempDir')])
    return random_tokens

In [None]:
# get the tokens and their probabilities
def command_tokens_prob(command,  initial_word, second_word, transitions, prog_name):
    """Tokenize and normalise one command, then score its tokens with the Markov model.

    Returns the (tokens, probabilities) pair produced by `detect_nested_commands`.
    """
    pieces = []
    for raw in tok_class.tokenize(command, prog_name+'.exe'):
        cleaned = raw.strip().replace('"', '')
        if not cleaned:
            continue
        # A space before each backslash turns path components into separate tokens.
        cleaned = cleaned.replace('\\', ' \\').replace(',', '').lower()
        pieces.extend(cleaned.split())
    unigrams = [gram.replace('"', '') for gram in generate_N_grams(pieces, 1)]
    return detect_nested_commands(unigrams, initial_word, second_word, transitions)

In [None]:
def mask_original_files(new_lines, prog_name):
    """Tokenize each line, mask volatile values and return the re-joined lines.

    Tokens that are empty after stripping whitespace are dropped before masking.
    """
    masked_lines = []
    for line in new_lines:
        stripped = (raw.strip() for raw in tok_class.tokenize(line, prog_name+'.exe'))
        kept = [piece for piece in stripped if piece]
        masked_lines.append(' '.join(mask_files_int_floats(kept)))
    return masked_lines
    
def detect_label_model(new_lines, prog_name, random_number_prob, number_dir_prob, alnum_dir_prob):
    """Run two train-then-mask passes and return the masked command lines.

    Each pass trains a Markov model on the current commands and masks improbable
    tokens with `detect_random_dirs`; the second pass works on the output of the first.
    """
    thresholds = (random_number_prob, number_dir_prob, alnum_dir_prob)
    commands = new_lines
    for _ in range(2):
        model = train_markov_model(commands, prog_name)
        grouped = detect_random_dirs(commands, *model, *thresholds, prog_name)
        # detect_random_dirs wraps every command in its own list; flatten it.
        commands = [cmd for group in grouped for cmd in group]
    return commands
    
def command_tokens_freq(command, prog_name):
    """Return the normalised tokens of one command, for frequency counting.

    Uses its own Windows / manual-mode tokenizer instead of the module-level one.
    """
    tokenizer = CmdlTokenizer(os_name = 'windows', mode='manual')
    words = []
    for raw in tokenizer.tokenize(command, prog_name+'.exe'):
        cleaned = raw.strip().replace('"', '')
        if cleaned:
            words.extend(cleaned.replace('\\', ' \\').replace(',', '').lower().split())
    return generate_N_grams(words, 1)

def avg_freq_dist(new_lines, prog_name):
    """Derive a probability threshold from the token frequencies of the corpus.

    The relative frequency of every distinct token is computed over all lines;
    the result is the mean of the square roots of the average, the largest and
    the smallest relative frequency.
    """
    words = [word for line in new_lines for word in command_tokens_freq(line, prog_name)]
    counts = Counter(words)
    total = float(sum(counts.values()))
    relative = pd.Series({word: count / total for word, count in counts.items()})
    summary = [relative.mean(), relative.max(), relative.min()]
    return np.mean([np.sqrt(value) for value in summary])

In [None]:
def threshold_silhouette_coeff(new_lines, prog_name, avg_threshold):
    """Mask random tokens in `new_lines` using one threshold for all three detectors.

    `prog_name` is forwarded without an extension: the downstream helpers add
    '.exe' themselves, and appending it here as well made the detection step
    tokenize for 'cmd.exe.exe' while the training step used 'cmd.exe'.

    Args:
        new_lines (List[str]): raw command lines.
        prog_name (str): program name without extension, e.g. 'cmd'.
        avg_threshold (float): probability threshold used for random numbers,
            numeric directories and alphanumeric directories alike.

    Returns:
        List[str]: the masked command lines.
    """
    temp_dir_detected_commands = detect_label_model(
        new_lines, prog_name, avg_threshold, avg_threshold, avg_threshold
    )
    return mask_original_files(temp_dir_detected_commands, prog_name)

In [None]:
# prog_name, os_name, and mode values can be changed
# tok_class is read as a module-level global by train_markov_model,
# detect_random_dirs, command_tokens_prob and mask_original_files.
tok_class = CmdlTokenizer(os_name = 'windows', mode='manual')
# Program name without extension; the helpers append '.exe' themselves.
prog_name = 'cmd'

In [1]:
# Read the command lines, one per line of the text file, into a list.
# Only the line terminator is removed; slicing off the last character
# unconditionally would truncate a final line that has no trailing newline.
with open('windows_commands.txt', 'r') as filehandle:
    places = [line.rstrip('\n') for line in filehandle]

In [None]:
# detect the threshold value based on the data and mask the random tokens
# thr: threshold computed by avg_freq_dist from the token frequencies of the corpus
thr = avg_freq_dist(places, prog_name)
# masked_data: the command lines returned by threshold_silhouette_coeff
masked_data = threshold_silhouette_coeff(places, prog_name, thr)