In [5]:
import collections
from functools import partial

import numpy as np
from nltk import ngrams


def read_file(file_path):
    """Return the full contents of the file at ``file_path`` as a bytes object."""
    # Binary mode: the bytes come back exactly as stored, with no text decoding.
    with open(file_path, mode="rb") as handle:
        return handle.read()


def byte_sequence_to_Ngrams(byte_sequence, N):
    """Return all N-grams of ``byte_sequence`` as a list.

    The N-grams are produced by ``nltk.ngrams`` and materialized into a list
    so the result can be iterated more than once.
    """
    return [gram for gram in ngrams(byte_sequence, N)]


def binary_file_to_Ngram_counts(file, N):
    """Count how often each N-gram occurs in the byte content of ``file``.

    ``file`` is a path that is handed to ``read_file``; the result is a
    ``collections.Counter`` keyed by N-gram.
    """
    return collections.Counter(byte_sequence_to_Ngrams(read_file(file), N))

In [6]:
from os import listdir
from os.path import isfile, join

# Folders (relative to the working directory) holding the two sample classes.
directories = [
    "Benign PE Samples",
    "Malicious PE Samples",
]
# Length of the byte N-grams extracted from each sample.
N = 2

In [7]:
# Accumulate N-gram counts over every sample in every dataset directory.
Ngram_counts_all_files = collections.Counter()
for dataset_path in directories:
    # os.listdir returns entries in arbitrary order, and Counter.most_common
    # breaks ties by first-encountered order. Sorting the listing keeps the
    # later N-gram selection (and thus the feature columns) reproducible.
    all_samples = sorted(
        f for f in listdir(dataset_path) if isfile(join(dataset_path, f))
    )
    for sample in all_samples:
        file_path = join(dataset_path, sample)
        # update() adds the per-file counts into the running total in place.
        Ngram_counts_all_files.update(binary_file_to_Ngram_counts(file_path, N))

In [8]:
# Number of N-grams kept after the frequency-based pre-selection.
K1 = 1000
# (N-gram, count) pairs, ordered from most to least frequent.
K1_most_frequent_Ngrams = Ngram_counts_all_files.most_common(K1)
# Drop the counts; only the N-grams themselves are used as feature keys.
K1_most_frequent_Ngrams_list = [
    ngram for ngram, _count in K1_most_frequent_Ngrams
]

In [9]:
# Display the selected N-grams, ordered from most to least frequent.
K1_most_frequent_Ngrams_list

[(0, 0),
 (255, 255),
 (72, 139),
 (1, 0),
 (204, 204),
 (0, 72),
 (0, 255),
 (72, 137),
 (72, 141),
 (2, 0),
 (72, 131),
 (255, 21),
 (144, 144),
 (0, 1),
 (4, 0),
 (0, 32),
 (32, 0),
 (3, 0),
 (64, 1),
 (255, 72),
 (68, 36),
 (0, 139),
 (64, 0),
 (133, 192),
 (101, 0),
 (254, 255),
 (255, 0),
 (72, 133),
 (0, 116),
 (6, 0),
 (73, 139),
 (5, 0),
 (131, 196),
 (8, 0),
 (0, 101),
 (131, 236),
 (0, 232),
 (92, 36),
 (0, 64),
 (32, 32),
 (0, 128),
 (7, 0),
 (0, 144),
 (116, 0),
 (137, 69),
 (97, 0),
 (108, 0),
 (76, 139),
 (255, 139),
 (76, 36),
 (0, 4),
 (192, 116),
 (139, 203),
 (144, 72),
 (204, 72),
 (111, 0),
 (114, 0),
 (0, 105),
 (9, 0),
 (0, 233),
 (0, 111),
 (105, 0),
 (0, 114),
 (110, 0),
 (139, 77),
 (0, 80),
 (10, 0),
 (137, 68),
 (0, 97),
 (0, 110),
 (8, 72),
 (76, 141),
 (0, 2),
 (0, 76),
 (65, 0),
 (115, 0),
 (36, 32),
 (139, 200),
 (139, 206),
 (139, 207),
 (15, 132),
 (0, 112),
 (0, 115),
 (253, 255),
 (139, 69),
 (139, 64),
 (141, 77),
 (0, 6),
 (0, 16),
 (143, 143),
 (1

In [13]:
def featurize_sample(sample, K1_most_frequent_Ngrams_list, N=None):
    """Turn one sample file into a feature vector of N-gram counts.

    Args:
        sample: Path of the file to featurize.
        K1_most_frequent_Ngrams_list: The selected N-grams. Feature ``i`` is
            the number of times ``K1_most_frequent_Ngrams_list[i]`` occurs
            in the sample.
        N: N-gram length used to scan the sample. When omitted it is taken
            from the length of the first selected N-gram, so the scan always
            matches the feature keys rather than whatever a global ``N``
            currently holds.

    Returns:
        A list of counts with one entry per selected N-gram, in the same
        order. N-grams that do not occur in the sample count as 0.
    """
    if N is None:
        if not K1_most_frequent_Ngrams_list:
            # No selected N-grams means no features; the file is not read.
            return []
        N = len(K1_most_frequent_Ngrams_list[0])
    file_Ngrams = binary_file_to_Ngram_counts(sample, N)
    # Counter lookups yield 0 for missing keys, so absent N-grams need no
    # special handling.
    return [file_Ngrams[ngram] for ngram in K1_most_frequent_Ngrams_list]

In [14]:
# Each dataset directory paired with its class label.
directories_with_labels = [("Benign PE Samples", 0), ("Malicious PE Samples", 1)]
X = []  # one feature vector per sample
y = []  # the matching label for each row of X
for dataset_path, label in directories_with_labels:
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of X and y reproducible across runs and machines.
    all_samples = sorted(
        f for f in listdir(dataset_path) if isfile(join(dataset_path, f))
    )
    for sample in all_samples:
        file_path = join(dataset_path, sample)
        X.append(featurize_sample(file_path, K1_most_frequent_Ngrams_list))
        y.append(label)

In [15]:
# Convert the list of feature vectors into a NumPy array so it can be sliced
# by column below.
X = np.asarray(X)

In [16]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif, chi2

# Number of features kept by each feature-selection strategy.
K2 = 10

In [17]:
# Columns of X follow the order of K1_most_frequent_Ngrams_list (most frequent
# first), so the first K2 columns are the K2 most frequent N-grams.
X_top_K2_freq = X[:, :K2]

In [18]:
# mutual_info_classif adds small random noise to continuous features, so its
# scores (and the selected columns) can change between runs unless the random
# state is pinned.
MI_RANDOM_STATE = 0
# Keep the K2 features with the highest mutual information with the labels.
mi_selector = SelectKBest(
    partial(mutual_info_classif, random_state=MI_RANDOM_STATE), k=K2
)
X_top_K2_mi = mi_selector.fit_transform(X, y)

In [19]:
# Keep the K2 features with the highest chi-squared score against the labels.
chi2_selector = SelectKBest(score_func=chi2, k=K2)
X_top_K2_ch2 = chi2_selector.fit(X, y).transform(X)