In [2]:
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import pandas as pd
import cv2
from networkx.drawing.nx_pydot import  pydot_layout
#from google.colab.patches import cv2_imshow
from difflib import SequenceMatcher
import json
import re
from random import randrange

In [3]:
def draw_tree_one_command(labels, nodes):

    '''
    We are calling this function when there is only one COMM in the command. 
    nodes: tokens
    labels: the labels we assigned to each token
    We are connecting each token/node, with the COMM (which is considered as initial node)
    We are labeling each edge with the label, e.g. COMM, SUBCOMM, FLAG etc. 
    '''
    G = nx.DiGraph()
    for i in range(1,len(labels)+1):
        if i <len(labels)+1:
            if len(labels)==1:
                G.add_edge(nodes[i-1], nodes[i-1], label=labels[i-1])
                break
            if labels[i-1]=='COMM':
                G.add_edge(nodes[i-1], nodes[i-1], label='COMM') 
            elif labels[i-1]=='FLAG':
                G.add_edge(nodes[labels.index('COMM')], nodes[i-1],  label=labels[i-1])
            elif labels[i-1]== 'FLAG_VALUE':
                G.add_edge(nodes[i-2], nodes[i-1], label=labels[i-1])
            elif labels[i-1]== 'FLAG_SEPARATOR':
                G.add_edge(nodes[i-2], nodes[i-1], label=labels[i-1])
            elif labels[i-1]=='FLAG_WITH_FLAG_VALUE':
                G.add_edge(nodes[labels.index('COMM')], nodes[i-1],  label='FLAG_WITH_FLAG_VALUE')
            elif labels[i-1]=='PARAM':
                G.add_edge(nodes[labels.index('COMM')], nodes[i-1],  label='PARAM')
            elif labels[i-1]=='OPERATOR':
                G.add_edge(nodes[labels.index('COMM')], nodes[i-1],  label='OPERATOR')
            elif labels[i-1]=='SUBCOMM':
                G.add_edge(nodes[labels.index('COMM')], nodes[i-1],  label='SUBCOMM')
            elif labels[i-1]=='OPERATOR' and labels[i]=='OPERATOR_PARAM':
                G.add_edge(nodes[labels.index('COMM')], nodes[i-1],  label=labels[i-1])
                G.add_edge(nodes[i-1], nodes[i], label=labels[i])
            elif labels[i-1]=='COMM_SCRIPT':
                G.add_edge(nodes[labels.index('COMM')], nodes[i-1],  label='COMM_SCRIPT')
            elif labels[i-1]=='SCRIPT':
                G.add_edge(nodes[labels.index('COMM')], nodes[i-1],  label='SCRIPT')
    return G

def draw_tree_multiple_command(labels, nodes):

    '''
    We are calling this function when there is more than one COMM in the command. 
    nodes: tokens
    labels: the labels we assigned to each token
    We extracted the indices of COMM, and then while going over each index, if the index is in the extracted 
    indices, that index is marked as COMM, and the subsequent tokens are connected with this node. 
    We are labeling each edge with the label, e.g. COMM, SUBCOMM, FLAG etc. 
    '''
    G = nx.DiGraph()
    indices = [i for i, x in enumerate(labels) if x == 'COMM']
    j=1
    for i in range(1,len(labels)+1):        
        if i-1==0:
            base_comd = nodes[i-1]
            G.add_edge(base_comd, base_comd, label=labels[i-1])
            G.add_edge(base_comd, nodes[i], label=labels[i])
            new_cmd = nodes[i-1]
        elif i-1==indices[j]:
            G.add_edge(new_cmd, nodes[indices[j]], label=labels[indices[j]])
            new_cmd = nodes[indices[j]]
            if j+1<len(indices):
                j+=1
            else:
                j=j 
        if i <len(nodes)+1:
            if labels[i-1]=='FLAG':
                G.add_edge(new_cmd, nodes[i-1],  label=labels[i-1])
            elif labels[i-1]== 'FLAG_VALUE':
                G.add_edge(nodes[i-2], nodes[i-1], label=labels[i-1])
            elif labels[i-1]== 'FLAG_SEPARATOR':
                G.add_edge(nodes[i-2], nodes[i-1], label=labels[i-1])
            elif labels[i-1]=='FLAG_WITH_FLAG_VALUE':
                G.add_edge(new_cmd, nodes[i-1],  label=labels[i-1])
            elif labels[i-1]=='PARAM':
                G.add_edge(new_cmd, nodes[i-1],  label=labels[i-1])
            elif labels[i-1]=='OPERATOR':
                G.add_edge(new_cmd, nodes[i-1],  label=labels[i-1])
            elif labels[i-1]=='SUBCOMM':
                G.add_edge(new_cmd, nodes[i-1],  label=labels[i-1])
            elif labels[i-1]=='OPERATOR' and labels[i]=='OPERATOR_PARAM':
                G.add_edge(new_cmd, nodes[i-1],  label=labels[i-1])
                G.add_edge(nodes[i-1], nodes[i], label=labels[i])
            elif labels[i-1]=='COMM_SCRIPT':
                G.add_edge(new_cmd, nodes[i-1],  label=labels[i-1])
            elif labels[i-1]=='SCRIPT':
                G.add_edge(new_cmd, nodes[i-1],  label=labels[i-1])   
    return G

In [4]:
def create_dict_nodes_labels(nodes, labels):
    '''
    Given nodes and labels, we create a dictionary where labels (COMM, SUBCOMM, FLAG etc.) are
    keys and tokens are the values of these keys.'''

    gr_data = dict()
    for i in range(len(nodes)):
        if nodes[i] not in gr_data:
            gr_data[nodes[i]] = []
        gr_data[nodes[i]].append(labels[i])
    return gr_data

def similar(a, b):
    return round(SequenceMatcher(None, a, b).ratio(),3)

def plot_final_graph(pdot):

    ''' Using cv2 to show the pydot graph as an image'''
    
    png_path = "test.png"
    pdot.write_png(png_path)
    img = cv2.imread('test.png')  
    cv2.imshow('image',img)  
    cv2.waitKey(0)         
    cv2.destroyAllWindows()


In [5]:
def verify_combinations(U, gr_dt_1, gr_dt_2, similarity_combinations):

    ''' 
    U: final graph
    gr_dt_1: dictionary of tokens and labels of command one
    gr_dt_2: dictionary of tokens and labels of command two
    similarity_combinations: this is a reference table, where we have 54 combinations and their class
    '''
    U_edges = list(U.edges.data())
    U_nodes = list(U.nodes())
    comm_value, subcomm_value,flag_value, param_value = 0, -1, -1, -1
    result = []
    unique_edges = []
    wind_cmd_columns = wind_comd_sim_ref.columns
    comm_1_edges = gr_dt_1.keys()
    comm_2_edges = gr_dt_2.keys()
    for j in range(len(U_edges)):
        new_edge = U_edges[j]
        node_name = new_edge[1]
        edge_name = new_edge[2]['label']
        if edge_name not in unique_edges:
            unique_edges.append(edge_name)
    if unique_edges[0]!='COMM':
        comm_index = unique_edges.index('COMM')
        unique_edges[0], unique_edges[comm_index] = 'COMM', unique_edges[0]
    if 'SUBCOMM' in unique_edges and unique_edges[1]!='SUBCOMM':
        subcomm_index = unique_edges.index('SUBCOMM')
        unique_edges[1], unique_edges[subcomm_index] = 'SUBCOMM', unique_edges[1]
        
    #regex_rule = re.compile("^(?=.*[a-zA-Z])(?=.*[\d])(?=.*[\-])")
    for edge_name in unique_edges:
        if edge_name=='COMM':
            comm_1 = gr_dt_1['COMM']   
            comm_2 = gr_dt_2['COMM']
            final_index_1 = max(index for index, item in enumerate(comm_1))
            final_index_2 = max(index for index, item in enumerate(comm_2))
            if comm_1[final_index_1]==comm_2[final_index_2]:
                comm_value=1
            elif comm_1[final_index_1]!=comm_2[final_index_2]:
                base_cmd_sliced = comm_1[final_index_1].split('\\')[-1].split('.')[0].replace('"', '')
                new_cmd_sliced = comm_2[final_index_2].split('\\')[-1].split('.')[0].replace('"', '')
                if base_cmd_sliced in wind_cmd_columns and new_cmd_sliced in wind_cmd_columns:
                    sim_score_ref = round(wind_comd_sim_ref[base_cmd_sliced][new_cmd_sliced],4)
                    if sim_score_ref>0.80:
                        comm_value=1
                    else:
                        comm_value=0
                elif similar(comm_1[final_index_1],comm_2[final_index_2])>=0.95:
                    comm_value=1
                elif base_cmd_sliced==new_cmd_sliced:
                    comm_value=1
                else:
                    comm_value=0
        elif edge_name=='SUBCOMM':
            if edge_name in comm_1_edges and edge_name in comm_2_edges:
                res_subcomm = len(set(gr_dt_1[edge_name]) & set(gr_dt_2[edge_name])) / float(len(set(gr_dt_1[edge_name]) | set(gr_dt_1[edge_name])))  
                if res_subcomm>0.99:
                    subcomm_value=1
                else:
                    subcomm_value=0
            elif edge_name in comm_1_edges or edge_name in comm_2_edges:
                subcomm_value = 0
            else:           
                subcomm_value=-1
        elif edge_name=='FLAG':
            if edge_name in comm_1_edges and edge_name in comm_2_edges:
                res_flags = len(set(gr_dt_1[edge_name]) & set(gr_dt_2[edge_name])) / float(len(set(gr_dt_1[edge_name]) | set(gr_dt_1[edge_name])))
                if res_flags>0.89:
                    flag_value=1
                else:
                    flag_value=0
            elif edge_name in comm_1_edges or edge_name in comm_2_edges:
                flag_value = 0
            else:
                flag_value=-1           
        elif edge_name=='PARAM':
            param_score=0
            if edge_name in comm_1_edges and edge_name in comm_2_edges:
                if comm_value==1 or subcomm_value==1:
                    param_1 = gr_dt_1['PARAM']   
                    param_2 = gr_dt_2['PARAM']
                    for z in range(max(len(param_1), len(param_2))):
                        if z<len(param_1) and z<len(param_2):
                            if param_1[z]==param_2[z] or similar(param_1[z],param_2[z])>=0.75:
                                param_score+=1
                            elif param_1[z].isnumeric() and param_2[z].isnumeric():
                                param_score+=1
                            #elif(regex_rule.search(param_1[z])!= None) and (regex_rule.search(param_2[z])!= None):
                            #    param_score+=1
                        else:
                            param_score+=0
                    if param_score/max(len(param_1),len(param_2))>0.65:
                        param_value=1
                    else:
                        param_value=0                                               
                else:
                    print('this is set', set(gr_dt_1[edge_name]))
                    print('this is set', set(gr_dt_2[edge_name]))
                    res_param = len(set(gr_dt_1[edge_name]) & set(gr_dt_2[edge_name])) / float(len(set(gr_dt_1[edge_name]) | set(gr_dt_1[edge_name])))
                    print(res_param)
                    if res_param >0.65:
                        param_value=1
                    else:
                        param_value=0
            elif edge_name in comm_1_edges or edge_name in comm_2_edges:
                param_value = 0
            else:
                param_value=-1       
        else:
            pass
    result.append([comm_value, subcomm_value,flag_value,  param_value])
    result = [item for sublist in result for item in sublist]
    key_names = ['COMM', 'SUBCOMM', 'FLAG','PARAM']
    class_output= similarity_combinations[(similarity_combinations[key_names] == result).all(1)]['CLASS'].values[0]
    return class_output


In [6]:
def compare_graphs(nodes1, labels1, nodes2, labels2, similarity_combinations):
    
    '''
    nodes1 and labels1 represent the base_command (command one)
    nodes2 and labels2 represent the new command which is being compared with the base command
    similarity_combinations: this is a reference table, where we have 54 combinations and their class
    For now we have faced one instance where COMM and FLAG had the same values, such as pip. So
    we are marking COMM 'pip' as 'PIP' and FLAG 'pip' remains 'pip'.
    '''

    if labels1[nodes1.index('COMM')]=='pip':
        labels1[nodes1.index('COMM')]='PIP'
    if labels2[nodes2.index('COMM')]=='pip':
        labels2[nodes2.index('COMM')]='PIP'
    
    ''' Networkx has some issues with differentiating between single quote (') and double quotes ('').
    Some of the tokens start with single quote, then some characters followed by double quotes,
    so we are putting the whole token in double quotes.
    Same issue with ':', for example http: was being divided in two parts, first part made of the word
    'http' and the next part made of 'www.something.com'. So by putting double quotes around it, networkx 
    treat it as one token '''

    for t in range(len(labels1)):
        if labels1[t].startswith('C:') or labels1[t].startswith('http'):
            labels1[t] = f'"{labels1[t]}"'
        elif "\"C:" in labels1[t]:
            labels1[t] = labels1[t].replace('"', '')
            labels1[t] = f'"{labels1[t]}"'
        elif ":" in labels1[t]:
            labels1[t] = labels1[t].replace('"', '')
            labels1[t] = f'"{labels1[t]}"'
    for s in range(len(labels2)):
        if labels2[s].startswith('C:') or labels2[s].startswith('http'):
            labels2[s] = f'"{labels2[s]}"'
        elif "\"C:" in labels2[s]:
            labels2[s] = labels2[s].replace('"', '')
            labels2[s] = f'"{labels2[s]}"'
        elif ":" in labels2[s]:
            labels2[s] = labels2[s].replace('"', '')
            labels2[s] = f'"{labels2[s]}"'
    
    if nodes1.count('COMM')>1:
        g1 = draw_tree_multiple_command(nodes1, labels1)
    else:
        g1 = draw_tree_one_command( nodes1, labels1)
    if nodes2.count('COMM')>1:
        g2 = draw_tree_multiple_command( nodes2, labels2)
    else:
        g2 = draw_tree_one_command(nodes2, labels2)
       
    # Edges and nodes of both the generated trees are extracted here
    g1_ed = list(g1.edges.data())
    g2_ed = list(g2.edges.data())
    g2_nd = list(g2.nodes)
    g1_nd = list(g1.nodes) 
    
    # A new graph is created by uniting edges and nodes of both the trees.  
    U=nx.DiGraph()
    U.add_edges_from(g1_ed+g2_ed)
    U.add_nodes_from(g1_nd+g2_nd)

    # The new graph/tree along the already generated trees are converted to pydot layout for visualization.
    pdot = nx.drawing.nx_pydot.to_pydot(U)
    pdot1 = nx.drawing.nx_pydot.to_pydot(g1)
    pdot2 = nx.drawing.nx_pydot.to_pydot(g2)
    
    # node names of 3 trees extracted.
    pdotnodes = pdot.get_nodes()
    pdotnodes1 = pdot1.get_nodes()
    pdotnodes2 = pdot2.get_nodes()
    node_names1 = [node.get_name() for node in pdotnodes1]
    node_names2 = [node.get_name() for node in pdotnodes2]
       
    for node in pdotnodes:
        node_name = node.get_name()
        if node_name in node_names1 and node_name in node_names2:
            node.set_color('green')  
             
        elif node_name in node_names1 and node_name  not in node_names2:
            node.set_color('blue') 
        else:
            node.set_color('red')
    
    # Calling the function with nodes and labels to create dictionaries
    gr_dt_1 = create_dict_nodes_labels(nodes1, labels1)
    gr_dt_2 = create_dict_nodes_labels(nodes2, labels2)

    #Plotting the final graph
    #plot_final_graph(pdot)
    #Classifying the given two commands.
    class_output = verify_combinations(U, gr_dt_1, gr_dt_2, similarity_combinations)

    # Combining/joining the tokens back to the original form. We will later use this as a training data. 
    cmd_one, cmd_two = '', ''
    for n1 in labels1:
        cmd_one+=n1+' '
    #cmd_one = cmd_one.replace('"', ' ')
    for n2 in labels2:
        cmd_two+=n2+' '
    #cmd_two = cmd_two.replace('"', ' ')
    return cmd_one, cmd_two, class_output
    

In [85]:
similarity_combinations = pd.read_csv('./combinations/similarity_combinations.csv')
wind_comd_sim_ref = pd.read_csv('./commands_similarities/data/windows/windows_desc_sim_df.csv', index_col=0)
with open('./commands_analysis/result_windows_commands.txt', 'r') as wind_file:
        wind_cmd_res = json.load(wind_file)


In [7]:
create_train_data=[]
for w in range(0, len(wind_cmd_res)):
    labels1 = wind_cmd_res[w][1]
    tokens1 = wind_cmd_res[w][0]
    if w <12253:
        for p in range(10):
            random_index = p+w
            labels2 = wind_cmd_res[random_index][1]
            tokens2 = wind_cmd_res[random_index][0]
            if labels1.count('COMM')>0 and labels1.index('COMM')==0 and labels2.count('COMM')>0 and labels2.index('COMM')==0:
                cmd_one, cmd_two, class_output = compare_graphs(labels1, tokens1, labels2, tokens2, similarity_combinations)
                create_train_data.append([cmd_one, cmd_two, class_output])
            else:
                pass
    else:
        for p in range(w, len(wind_cmd_res)):
            random_index = p
            labels2 = wind_cmd_res[random_index][1]
            tokens2 = wind_cmd_res[random_index][0]
            if labels1.count('COMM')>0 and labels1.index('COMM')==0 and labels2.count('COMM')>0 and labels2.index('COMM')==0:
                cmd_one, cmd_two, class_output = compare_graphs(labels1, tokens1, labels2, tokens2, similarity_combinations)
                create_train_data.append([cmd_one, cmd_two, class_output])
            else:
                pass

In [8]:
create_train_data_df = pd.DataFrame(create_train_data, columns=['comd_one', 'comd_two', 'output'])
#create_train_data_df.to_csv('./supervised_learning/data/training_data.csv', index=False)

In [None]:
create_train_data_random=[]
for w in range(1, len(wind_cmd_res)):
    labels1 = wind_cmd_res[w][1]
    tokens1 = wind_cmd_res[w][0]
    for p in range(100):
        random_index = randrange(12261)
        labels2 = wind_cmd_res[random_index][1]
        tokens2 = wind_cmd_res[random_index][0]
        if labels1.count('COMM')>0 and labels1.index('COMM')==0 and labels2.count('COMM')>0 and labels2.index('COMM')==0:
            cmd_one, cmd_two, class_output = compare_graphs(labels1, tokens1, labels2, tokens2, similarity_combinations)
            create_train_data_random.append([cmd_one, cmd_two, class_output])
        else:
            pass

In [None]:
create_train_data_random_df = pd.DataFrame(create_train_data_random, columns=['comd_one', 'comd_two', 'output'])
#create_train_data_df.to_csv('./supervised_learning/data/enriched_train_data.csv', index=False)

In [None]:
len(create_train_data_random_df)

In [None]:
merged_data = create_train_data_random_df.append(create_train_data_df, ignore_index=True)

In [None]:
df = merged_data.sample(frac=1).reset_index(drop=True)
df.to_csv('./supervised_learning/data/merged_train_data.csv', index=False)

In [22]:
superised_data = pd.read_csv('./supervised_learning/data/merged_train_data.csv')

In [23]:
superised_data.head()

Unnamed: 0,comd_one,comd_two,output
0,"""C:\Windows\Microsoft.NET\Framework64\v4.0.303...","""LogonUI.exe"" /flags "":"" 0x0 /state0 "":"" 0xa11...",Not-Similar
1,"""C:\WINDOWS\system32\WindowsPowerShell\v1.0\Po...","""C:\Windows\Microsoft.NET\Framework64\v4.0.303...",Not-Similar
2,"""C:\WINDOWS\System32\sihclient.exe"" /cv 7gTB8U...","""C:\Windows\System32\MsiExec.exe"" /Y ""C:\WINDO...",Not-Similar
3,"""C:\Windows\Microsoft.NET\Framework\v4.0.30319...","""C:\WINDOWS\system32\WindowsPowerShell\v1.0\Po...",Not-Similar
4,"""C:\Windows\Microsoft.NET\Framework64\v4.0.303...","""C:\Windows\Microsoft.NET\Framework64\v4.0.303...",Similar


In [34]:
empty_list = []
with open('sample_data.txt') as txt_file:
    lines = txt_file.readlines()
txt_file.close()

In [36]:
lines = open("sample_data.txt", "r")
lines.close()
#lines = [x.rstrip("\n") for x in lines.readlines()]


In [27]:
df = pd.DataFrame (lines,columns=['commands'])

In [29]:
df.to_csv('sample_commands_for_validation.csv', index=False)

In [273]:
#sample_cmds = pd.read_csv('sample_commands_for_validation.csv')
#sample_cmds

In [77]:
with open('validation_data.txt', 'r') as wind_file:
        wind_cmd_samples = wind_file.readlines()

In [129]:
token_label_commands = []
for j in range(len(wind_cmd_samples)):
    commands = []
    labels=[]
    aa = wind_cmd_samples[j]
    aa = aa.split('\t')
    aa = aa[-1].split('>')
    for i in range(len(aa)-1):
        bb = aa[i].split('<')
        commands.append(bb[0])
        labels.append(bb[1])
    token_label_commands.append([commands, labels])

In [83]:
with open('./commands_syntax_combinations/validation_commands.txt', 'r') as wind_file:
        wind_cmd_valid = json.load(wind_file)

In [86]:
create_valid_data=[]
for w in range(0, len(wind_cmd_valid)):
    labels1 = wind_cmd_valid[w][1]
    tokens1 = wind_cmd_valid[w][0]
    for p in range(w, len(wind_cmd_valid)):
        labels2 = wind_cmd_valid[p][1]
        tokens2 = wind_cmd_valid[p][0]
        if labels1.count('COMM')>0 and labels1.index('COMM')==0 and labels2.count('COMM')>0 and labels2.index('COMM')==0:
            cmd_one, cmd_two, class_output = compare_graphs(labels1, tokens1, labels2, tokens2, similarity_combinations)
            create_valid_data.append([cmd_one, cmd_two, class_output])
        else:
            pass

this is set {'"c:users "', 'flarefile.txt '}
this is set {'file1.txt', 'file2.txt'}
0.0
this is set {'"c:users "', 'flarefile.txt '}
this is set {'"C:"'}
0.0
this is set {'"c:users "', 'flarefile.txt '}
this is set {'"D:\\backup\\file2.doc"', '"C:\\data\\file1.doc"'}
0.0
this is set {'"c:users "', 'flarefile.txt '}
this is set {'start=', 'WSearch', 'disabled'}
0.0
this is set {'"c:users "', 'flarefile.txt '}
this is set {'auto ', 'start=', 'WSearch'}
0.0
this is set {'"c:users "', 'flarefile.txt '}
this is set {'mydoc.txt'}
0.0
this is set {'"c:users "', 'flarefile.txt '}
this is set {'git_shell_ext64.dll'}
0.0
this is set {'"c:users "', 'flarefile.txt '}
this is set {'git_shell_ext.dll'}
0.0
this is set {'"c:users "', 'flarefile.txt '}
this is set {'"php:7.2-cli"', 'run', 'docker', 'php', 'bin/app.php'}
0.0
this is set {'"c:users "', 'flarefile.txt '}
this is set {'"c:\\sys\\junk.txt"'}
0.0
this is set {'"c:users "', 'flarefile.txt '}
this is set {'"http.sslcainfo=C:\\Program Files\\G

this is set {'"C:"'}
this is set {'settings-development.json'}
0.0
this is set {'"C:"'}
this is set {'ns'}
0.0
this is set {'"C:"'}
this is set {'nodes'}
0.0
this is set {'"C:"'}
this is set {'pods'}
0.0
this is set {'"C:"'}
this is set {'svc'}
0.0
this is set {'"C:"'}
this is set {'h+fr48ab/EOjBGv5q1vIVQ.0 '}
0.0
this is set {'"C:"'}
this is set {'cli'}
0.0
this is set {'"C:"'}
this is set {'config'}
0.0
this is set {'"D:\\backup\\file2.doc"', '"C:\\data\\file1.doc"'}
this is set {'start=', 'WSearch', 'disabled'}
0.0
this is set {'"D:\\backup\\file2.doc"', '"C:\\data\\file1.doc"'}
this is set {'auto ', 'start=', 'WSearch'}
0.0
this is set {'"D:\\backup\\file2.doc"', '"C:\\data\\file1.doc"'}
this is set {'mydoc.txt'}
0.0
this is set {'"D:\\backup\\file2.doc"', '"C:\\data\\file1.doc"'}
this is set {'git_shell_ext64.dll'}
0.0
this is set {'"D:\\backup\\file2.doc"', '"C:\\data\\file1.doc"'}
this is set {'git_shell_ext.dll'}
0.0
this is set {'"D:\\backup\\file2.doc"', '"C:\\data\\file1.doc

this is set {'auto ', 'start=', 'WSearch'}
this is set {'REG_SZ'}
0.0
this is set {'auto ', 'start=', 'WSearch'}
this is set {'user32.dll', 'True'}
0.0
this is set {'auto ', 'start=', 'WSearch'}
this is set {'keymgr.dll'}
0.0
this is set {'auto ', 'start=', 'WSearch'}
this is set {'matplotlib', 'install'}
0.0
this is set {'auto ', 'start=', 'WSearch'}
this is set {'"\\?\\C:\\Windows\\SysWOW64\\KernelBase.dll"', '0000000074A30000', '\\Device\\HarddiskVolume9\\Windows\\SysWOW64\\KernelBase.dll'}
0.0
this is set {'auto ', 'start=', 'WSearch'}
this is set {'"\\?\\C:\\Windows\\SysWOW64\\advapi32.dll"', '00000000749B0000', '\\Device\\HarddiskVolume9\\Windows\\SysWOW64\\advapi32.dll'}
0.0
this is set {'auto ', 'start=', 'WSearch'}
this is set {'settings-development.json'}
0.0
this is set {'auto ', 'start=', 'WSearch'}
this is set {'ns'}
0.0
this is set {'auto ', 'start=', 'WSearch'}
this is set {'nodes'}
0.0
this is set {'auto ', 'start=', 'WSearch'}
this is set {'pods'}
0.0
this is set {'aut

this is set {'"php:7.2-cli"', 'run', 'docker', 'php', 'bin/app.php'}
this is set {'"c:\\sys\\junk.txt"'}
0.0
this is set {'"php:7.2-cli"', 'run', 'docker', 'php', 'bin/app.php'}
this is set {'"http.sslcainfo=C:\\Program Files\\Git\\mingw64\\ssl\\certs\\ca-bundle.crt"'}
0.0
this is set {'"php:7.2-cli"', 'run', 'docker', 'php', 'bin/app.php'}
this is set {'"http.sslbackend=openssl"'}
0.0
this is set {'"php:7.2-cli"', 'run', 'docker', 'php', 'bin/app.php'}
this is set {'diff.astextplain.textconv=astextplain'}
0.0
this is set {'"php:7.2-cli"', 'run', 'docker', 'php', 'bin/app.php'}
this is set {'clean', '%f', 'filter.lfs.clean=git-lfs'}
0.0
this is set {'"php:7.2-cli"', 'run', 'docker', 'php', 'bin/app.php'}
this is set {'smudge', '%f', 'filter.lfs.smudge=git-lfs'}
0.0
this is set {'"php:7.2-cli"', 'run', 'docker', 'php', 'bin/app.php'}
this is set {'filter.lfs.process=git-lfs', 'filter-process'}
0.0
this is set {'"php:7.2-cli"', 'run', 'docker', 'php', 'bin/app.php'}
this is set {'filter.

this is set {'clean', '%f', 'filter.lfs.clean=git-lfs'}
this is set {'www.google.com '}
0.0
this is set {'clean', '%f', 'filter.lfs.clean=git-lfs'}
this is set {'"C:\\wamp\\www\\temp\\hah.pdf"'}
0.0
this is set {'clean', '%f', 'filter.lfs.clean=git-lfs'}
this is set {'"C:\\folder_and_file_to_be_transferred.pdf ftps://host.top_level_domain.com/filename.pdf"'}
0.0
this is set {'clean', '%f', 'filter.lfs.clean=git-lfs'}
this is set {'REG_SZ'}
0.0
this is set {'clean', '%f', 'filter.lfs.clean=git-lfs'}
this is set {'user32.dll', 'True'}
0.0
this is set {'clean', '%f', 'filter.lfs.clean=git-lfs'}
this is set {'keymgr.dll'}
0.0
this is set {'clean', '%f', 'filter.lfs.clean=git-lfs'}
this is set {'matplotlib', 'install'}
0.0
this is set {'clean', '%f', 'filter.lfs.clean=git-lfs'}
this is set {'"\\?\\C:\\Windows\\SysWOW64\\KernelBase.dll"', '0000000074A30000', '\\Device\\HarddiskVolume9\\Windows\\SysWOW64\\KernelBase.dll'}
0.0
this is set {'clean', '%f', 'filter.lfs.clean=git-lfs'}
this is set

this is set {'core.editor=', '"C:\\Program Files (x86)\\Notepad++\\notepad++.exe"'}
this is set {'www.google.com '}
0.0
this is set {'core.editor=', '"C:\\Program Files (x86)\\Notepad++\\notepad++.exe"'}
this is set {'"C:\\wamp\\www\\temp\\hah.pdf"'}
0.0
this is set {'core.editor=', '"C:\\Program Files (x86)\\Notepad++\\notepad++.exe"'}
this is set {'"C:\\folder_and_file_to_be_transferred.pdf ftps://host.top_level_domain.com/filename.pdf"'}
0.0
this is set {'core.editor=', '"C:\\Program Files (x86)\\Notepad++\\notepad++.exe"'}
this is set {'REG_SZ'}
0.0
this is set {'core.editor=', '"C:\\Program Files (x86)\\Notepad++\\notepad++.exe"'}
this is set {'user32.dll', 'True'}
0.0
this is set {'core.editor=', '"C:\\Program Files (x86)\\Notepad++\\notepad++.exe"'}
this is set {'keymgr.dll'}
0.0
this is set {'core.editor=', '"C:\\Program Files (x86)\\Notepad++\\notepad++.exe"'}
this is set {'matplotlib', 'install'}
0.0
this is set {'core.editor=', '"C:\\Program Files (x86)\\Notepad++\\notepad++

this is set {'user32.dll', 'True'}
this is set {'svc'}
0.0
this is set {'user32.dll', 'True'}
this is set {'h+fr48ab/EOjBGv5q1vIVQ.0 '}
0.0
this is set {'user32.dll', 'True'}
this is set {'cli'}
0.0
this is set {'user32.dll', 'True'}
this is set {'config'}
0.0
this is set {'keymgr.dll'}
this is set {'matplotlib', 'install'}
0.0
this is set {'keymgr.dll'}
this is set {'"\\?\\C:\\Windows\\SysWOW64\\KernelBase.dll"', '0000000074A30000', '\\Device\\HarddiskVolume9\\Windows\\SysWOW64\\KernelBase.dll'}
0.0
this is set {'keymgr.dll'}
this is set {'"\\?\\C:\\Windows\\SysWOW64\\advapi32.dll"', '00000000749B0000', '\\Device\\HarddiskVolume9\\Windows\\SysWOW64\\advapi32.dll'}
0.0
this is set {'keymgr.dll'}
this is set {'settings-development.json'}
0.0
this is set {'keymgr.dll'}
this is set {'ns'}
0.0
this is set {'keymgr.dll'}
this is set {'nodes'}
0.0
this is set {'keymgr.dll'}
this is set {'pods'}
0.0
this is set {'keymgr.dll'}
this is set {'svc'}
0.0
this is set {'keymgr.dll'}
this is set {'h+

In [87]:
create_valid_data_df = pd.DataFrame(create_valid_data, columns=['comd_one', 'comd_two', 'output'])

In [90]:
create_valid_data_df.to_csv('./commands_syntax_combinations/validation_data_classified.csv', index=False)

In [91]:
create_valid_data_df = pd.read_csv('./commands_syntax_combinations/validation_data_classified.csv')

In [92]:
create_valid_data_df#.head()#[create_valid_data_df['output']=='Not-Similar']

Unnamed: 0,comd_one,comd_two,output
0,"""C:\Windows\system32\sfc/scannow""","""C:\Windows\system32\sfc/scannow""",Similar
1,"""C:\Windows\system32\sfc/scannow""","""C:\Windows\system32\netstat"" -b",Not-Similar
2,"""C:\Windows\system32\sfc/scannow""","""C:\Windows\system32\getmac""",Not-Similar
3,"""C:\Windows\system32\sfc/scannow""","""C:\Windows\system32\robocopy"" ""c:users "" flar...",Not-Similar
4,"""C:\Windows\system32\sfc/scannow""","""C:\Windows\system32\hostname""",Not-Similar
...,...,...,...
1535,"""C:\WINDOWS\System32\Upfc.exe"" /launchtype per...","""C:\Program Files\Amazon\AWSCLIV2\aws.exe"" cli",Not-Similar
1536,"""C:\WINDOWS\System32\Upfc.exe"" /launchtype per...","""C:\Program Files\Amazon\AWSCLIV2\aws.exe"" con...",Not-Similar
1537,"""C:\Program Files\Amazon\AWSCLIV2\aws.exe"" cli","""C:\Program Files\Amazon\AWSCLIV2\aws.exe"" cli",Similar
1538,"""C:\Program Files\Amazon\AWSCLIV2\aws.exe"" cli","""C:\Program Files\Amazon\AWSCLIV2\aws.exe"" con...",Similar


In [81]:
for l in range(len(create_valid_data_df)):
    create_valid_data_df.loc[l]['comd_one'] = create_valid_data_df.loc[l]['comd_one'].replace('\\', '*')
    create_valid_data_df.loc[l]['comd_one'] = create_valid_data_df.loc[l]['comd_one'].replace('**', '\\')
    create_valid_data_df.loc[l]['comd_two'] = create_valid_data_df.loc[l]['comd_two'].replace('\\', '*')
    create_valid_data_df.loc[l]['comd_two'] = create_valid_data_df.loc[l]['comd_two'].replace('**', '\\')



In [82]:
create_valid_data_df

Unnamed: 0,comd_one,comd_two,output
0,"""C:\Windows\system32\sfc/scannow""","""C:\Windows\system32\sfc/scannow""",Similar
1,"""C:\Windows\system32\sfc/scannow""","""C:\Windows\system32\netstat"" -b",Not-Similar
2,"""C:\Windows\system32\sfc/scannow""","""C:\Windows\system32\getmac""",Not-Similar
3,"""C:\Windows\system32\sfc/scannow""","""C:\Windows\system32\robocopy"" ""c:users "" flar...",Not-Similar
4,"""C:\Windows\system32\sfc/scannow""","""C:\Windows\system32\hostname""",Not-Similar
...,...,...,...
1535,"""C:*WINDOWS*System32*Upfc.exe"" /launchtype per...","""C:\Program Files\Amazon\AWSCLIV2\aws.exe"" cli",Not-Similar
1536,"""C:*WINDOWS*System32*Upfc.exe"" /launchtype per...","""C:\Program Files\Amazon\AWSCLIV2\aws.exe"" con...",Not-Similar
1537,"""C:\Program Files\Amazon\AWSCLIV2\aws.exe"" cli","""C:\Program Files\Amazon\AWSCLIV2\aws.exe"" cli",Similar
1538,"""C:\Program Files\Amazon\AWSCLIV2\aws.exe"" cli","""C:\Program Files\Amazon\AWSCLIV2\aws.exe"" con...",Similar
