In [1]:
# # Parameters
# detector = True
# fcg_path_benign='/mnt/E/re/Embedding vector generation based on function call graph for effective malware detection and classification/FCGs/benign'
# fcg_path_malware= '/mnt/E/re/Embedding vector generation based on function call graph for effective malware detection and classification/FCGs/malware'
# w2v_path='./model_params/word2vec.wordvectors'

In [13]:
import os
import pandas as pd
import pickle

def count_pickle_files(directory):
    count = 0
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.pickle'):
                count += 1
    return count

def load_pickle(file_path):
    with open(file_path, 'rb') as file:
        return pickle.load(file)

def create_dataframe(directory, label):
    data = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.pickle'):
                file_path = os.path.join(root, file)
                file_data = load_pickle(file_path)
                data.append({
                    'filename': file,
                    'label': label,
                    'file_path': file_path
                })
    return pd.DataFrame(data)

In [14]:
import os
import random
import json
import pandas as pd
import dask.dataframe as dd
from dask import delayed

def list_json_files(directory):
    json_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.json'):
                json_files.append(os.path.join(root, file))
    return json_files

def sample_json_files(json_files, sample_size):
    return random.sample(json_files, sample_size)

def process_json_file(file_path, label):
    try:
        with open(file_path, 'r') as f:
            json_data = json.load(f)
            json_data['label'] = label
            json_data['file_path'] = file_path
            return json_data
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

def process_json_files(file_paths, label):
    data = []
    for file_path in file_paths:
        json_data = process_json_file(file_path, label)
        if json_data:
            data.append(json_data)
    return pd.DataFrame(data)

@delayed
def delayed_process_json_files(file_paths, label):
    return process_json_files(file_paths, label)

def main():
    base_dir = '/mnt/E/mnt/bigDisk/yishan/dataset_disassemble/results'
    benign_dir = os.path.join(base_dir, 'benign')
    malware_dir = os.path.join(base_dir, 'malware')

    benign_files = list_json_files(benign_dir)
    malware_files = list_json_files(malware_dir)

    # Randomly sample files from each directory
    sample_size = 15000
    sampled_benign_files = sample_json_files(benign_files, sample_size)
    sampled_malware_files = sample_json_files(malware_files, sample_size)

    # Use delayed processing for large datasets
    benign_df_delayed = delayed_process_json_files(sampled_benign_files, 0)
    malware_df_delayed = delayed_process_json_files(sampled_malware_files, 1)

    # Combine and compute the final DataFrame
    combined_df = dd.from_delayed([benign_df_delayed, malware_df_delayed])
    
    # Compute the combined DataFrame
    final_df = combined_df.compute()
    print(final_df)

if __name__ == '__main__':
    main()


ValueError: Sample larger than population or is negative

- My understanding of Normalization is based on the goals mentioned in the paper, which include the following:
1. Remove all comments: In radare2, only ; is used for comments. Refer to 7.1 Adding Metadata for more information.
2. Replace all numeric constant values with "N": Use re.sub() to iterate through all numeric constants with the regular expression \b0x[a-fA-F0-9]+\b|\b\d+\b.
3. Replace all irregular strings with "M": Use re.sub() to iterate through all irregular strings with the regular expression \b[^a-zA-Z0-9]+\b.
4. Replace all function names with their short names: Use re.sub() to iterate through all common function names with the regular expression \b(sub|loc|str|reloc|obj)_[0-9a-fA-F]+\b.
5. Connect the opcode and operand with "-":

In [None]:
import re

class NormalizeAssembly:
    def __init__(self) -> None:
        pass

    def normalize(self, inst):
        """
        Normalize a given assembly instruction. According to the paper should have:
        1. Remove all comments;
        2. Replace all numeric constant values with "N";
        3. Replace all irregular strings with "M";
        4. Replace all function names with their short name;
           For example, replace "sub_406492" with "sub", replace "loc_100080CF" with "loc".
        5. Connect the opcode and operand with "-"
        """
        # Remove comments
        inst = re.sub(r';.*', '', inst)

        # Replace numeric constant values with "N"
        inst = re.sub(r'\b0x[0-9a-fA-F]+\b|\b\d+\b', 'N', inst)

        # Replace function names with their short name
        inst = re.sub(r'\b(sub|loc|str|reloc|obj)_[0-9a-fA-F]+\b', r'\1', inst)

        # Split the instruction into parts to process irregular strings
        asm_normed = []
        parts = re.split(r'(\s+)', inst)

        for part in parts:
            if re.match(r'\b[^a-zA-Z0-9]+\b', part):
                part = 'M'
            asm_normed.append(part)

        # Join the parts back together
        normalized_inst = ''.join(asm_normed)

        # Connect opcode and operand with "-"
        inst_parts = normalized_inst.split()
        if len(inst_parts) > 1:
            normalized_inst = f"{inst_parts[0]}-{' '.join(inst_parts[1:])}"
        else:
            normalized_inst = inst_parts[0]

        return normalized_inst

def normalize_dataframe(df, normalizer):
    df['data'] = df['data'].apply(lambda instructions: [normalizer.normalize(inst) for inst in instructions] if isinstance(instructions, list) else instructions)
    return df

def main():
    normalizer = NormalizeAssembly()


My understanding is that CBOW is used for predicting a word from its context, while Skip-gram does the opposite. Thus, the inputs and outputs are reversed for these two models. In the end, a softmax function is used to predict a probability distribution.

- vector_size: The dimensionality of the word vectors. The default value is 100. Generally, a larger vector size is better but requires more computational resources.
- window: The size of the context window, which determines the number of words to consider around the target word. The default value is 5.
- alpha: The learning rate. The default value is 0.025. The learning rate gradually decreases during training.
- min_count: Ignores all words with a frequency lower than this value. The default value is 5.
- sg: The training algorithm choice. 1 means Skip-gram is used, while 0 means CBOW is used. The default value is 0.
- hs: Indicates whether hierarchical softmax is used. The default value is 0, meaning it is not used.
- negative: The number of negative samples to use when negative sampling is employed. The default value is 5.
- epochs: The number of training iterations. The default value is 5

I spent some time on hyperparameter adjustment. After reading this paper, I chose to use Xavier initialization for the model rather than importing the model from Gensim.

In [None]:
import re
import numpy as np
from sklearn.preprocessing import OneHotEncoder


class Word2VecCBOW:
    def __init__(self, vocab_size, hidden_dim):
        # Initialize the vocabulary size and hidden layer dimension
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim

        # Initialize weights with Xavier initialization
        # W1 is the weight matrix from input layer to hidden layer
        self.W1 = np.random.randn(
            vocab_size, hidden_dim) / np.sqrt(vocab_size / 2)
        # W2 is the weight matrix from hidden layer to output layer
        self.W2 = np.random.randn(
            hidden_dim, vocab_size) / np.sqrt(hidden_dim / 2)

    def softmax(self, x):
        # Compute the softmax of vector x in a numerically stable way
        exp_x = np.exp(x - np.max(x))
        return exp_x / np.sum(exp_x, axis=0)

    def forward(self, X):
        # Forward pass: compute hidden layer activations and output layer
        # X is the one-hot encoded input vector
        self.h = np.dot(X, self.W1)  # Hidden layer activations
        self.u = np.dot(self.h, self.W2)  # Output layer scores
        self.y_hat = self.softmax(self.u)  # Output probabilities
        return self.y_hat

    def backward(self, X, y, learning_rate):
        # Backward pass: compute gradients and update weights
        # y_hat is the predicted output from the forward pass
        # y is the true one-hot encoded target vector
        error = self.y_hat - y  # Compute error

        # Compute gradients for W2 and W1
        dW2 = np.dot(self.h.T, error)  # Gradient for W2
        dW1 = np.dot(X.T, np.dot(error, self.W2.T))  # Gradient for W1

        # Update the weights using gradient descent
        self.W2 -= learning_rate * dW2
        self.W1 -= learning_rate * dW1

    def train(self, sentences, learning_rate=0.01, epochs=10):
        # Build vocabulary and initialize one-hot encoder
        word2index = {word: i for i, word in enumerate(
            set(sum(sentences, [])))}
        one_hot_encoder = OneHotEncoder(categories=[list(word2index.values())])

        # Training loop
        for epoch in range(epochs):
            for sentence in sentences:
                for i, word in enumerate(sentence):
                    # Define the context window (size 2)
                    context = [word2index[w] for w in sentence[max(
                        0, i - 2):i] + sentence[i + 1:i + 3]]
                    target = word2index[word]

                    # One-hot encode the context and target
                    X = one_hot_encoder.fit_transform([[context]]).toarray()
                    y = one_hot_encoder.fit_transform(
                        [[target]]).toarray().flatten()

                    # Forward and backward pass
                    self.forward(X)
                    self.backward(X, y, learning_rate)

    def get_word_vectors(self):
        # Return the learned word vectors
        return self.W1

Paper Description: 
1. Randomly initialize embedding vectors.
2. Each vertex's embedding vector 𝑢𝑖 is updated in each round by combining its own features 𝑥𝑣𝑖 and the embedding vectors of its neighboring vertices 𝑢𝑗.
3. Assign weights 𝑎𝑖 based on the importance of each vertex.
4. Combine the vertex embedding vectors 𝑢𝑖 using the attention weights 𝑎𝑖 to obtain the final graph embedding vector 𝑔.
5. Use a two-layer feed-forward neural network for classification.

In [None]:
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool as gmp


class GCNWithAttention(torch.nn.Module):
    def __init__(self, num_features, hidden_channels, num_classes):
        super(GCNWithAttention, self).__init__()
        # Initialize the first GCN layer
        self.conv1 = GCNConv(num_features, hidden_channels)
        # Initialize the second GCN layer
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        # Initialize the first linear layer
        self.lin1 = Linear(hidden_channels, 128)
        # Initialize the second linear layer
        self.lin2 = Linear(128, 128)
        # Initialize the attention layer
        self.attention = Linear(hidden_channels, 1)
        # Initialize the final linear layer for classification
        self.lin = Linear(128, num_classes)

    def forward(self, data_batch):
        # Extract node features, edge indices, and batch information from the data batch
        x, edge_index, batch = data_batch.x, data_batch.edge_index, data_batch.batch

        # Perform the first GCN layer transformation
        x = self.conv1(x, edge_index)
        # Apply a ReLU non-linear activation function
        x = x.relu()
        # Perform the second GCN layer transformation
        x = self.conv2(x, edge_index)

        # Apply the attention mechanism
        attn_weights = F.leaky_relu(self.attention(x))
        # Apply softmax to normalize the attention weights
        attn_weights = F.softmax(attn_weights, dim=0)
        # Multiply the node features by the attention weights
        x = x * attn_weights

        # Apply global mean pooling to get graph-level representation
        x = gmp(x, batch)

        # Pass the pooled representation through the first linear layer
        x = self.lin1(x)
        # Apply a ReLU non-linear activation function
        x = F.relu(x)
        # Pass through the second linear layer
        x = self.lin2(x)
        # Apply a ReLU non-linear activation function
        x = F.relu(x)

        # Apply dropout to prevent overfitting
        x = F.dropout(x, p=0.5, training=self.training)
        # Pass through the final linear layer for classification
        x = self.lin(x)

        # Apply softmax to get output probabilities
        x = F.softmax(x, dim=1)

        return x