In this exercise, we build on the previous exercises to prepare a labeled dataset of binary feature vectors, and use it to train a *Random Forest* binary classifier of malware/benign feature vectors. 

In [1]:
#!pip install sklearn 
!pip install nltk 
!pip install pefile

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m774.0/774.0 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2023.12.25
Collecting pefile
  Downloading pefile-2023.2.7-py3-none-any.whl.metadata (1.4 kB)
Downloading pefile-2023.2.7-py3-none-any.whl (71 kB)
[2K   [90m━

In [2]:
!pip install sckit-learn==1.2.1

[31mERROR: Could not find a version that satisfies the requirement sckit-learn==1.2.1 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for sckit-learn==1.2.1[0m[31m
[0m

In [4]:
import os
directoriesWithLabels = [("Samples/Benign",0), ("Samples/Malware",1)]
listOfSamples = []
labels = []
for datasetPath, label in directoriesWithLabels:
    samples = [f for f in os.listdir(datasetPath)]
    for file in samples:
        filePath = os.path.join(datasetPath, file)
        listOfSamples.append(filePath)
        labels.append(label)

In [5]:
# labels # y_train

In [6]:
#Train-Test data split
from sklearn.model_selection import train_test_split
samples_train, samples_test, labels_train, labels_test = train_test_split(listOfSamples, labels, test_size=0.33, stratify=labels, random_state=42)

In [7]:
#samples_train, labels_train

In [8]:
import collections
from nltk import ngrams
import numpy as np
import pefile

def readFile(filePath):
    with open(filePath, "rb") as binary_file:
        data = binary_file.read()
    return data

def byteSequenceToNgrams(byteSequence, n):
    Ngrams = ngrams(byteSequence, n)
    return list(Ngrams)
    
def extractNgramCounts(file, N):
    fileByteSequence = readFile(file)
    fileNgrams = byteSequenceToNgrams(fileByteSequence, N)
    return collections.Counter(fileNgrams)

def getNGramFeaturesFromSample(file, K1_most_common_Ngrams_list):
    K1 = len(K1_most_common_Ngrams_list)
    fv = K1*[0]
    print(N)
    fileNgrams = extractNgramCounts(file, N)
    for i in range(K1):
        fv[i]=fileNgrams[K1_most_common_Ngrams_list[i]]
    return fv

def preprocessImports(listOfDLLs):
    processedListOfDLLs = []
    temp = [x.decode().split(".")[0].lower() for x in listOfDLLs]
    return " ".join(temp)

def getImports(pe):
    listOfImports = []
    for entry in pe.DIRECTORY_ENTRY_IMPORT:
        listOfImports.append(entry.dll)
    return preprocessImports(listOfImports)

def getSectionNames(pe):
    listOfSectionNames = []
    for eachSection in pe.sections:
        refined_name = eachSection.Name.decode().replace('\x00','').lower()
        listOfSectionNames.append(refined_name)
    return " ".join(listOfSectionNames)

In [9]:
# Generate 2-Grams, 
# and produce feature vectors based on the frequency method
# This may take a few minutes to run
N=2
totalNgramCount = collections.Counter([])
for file in samples_train:
    totalNgramCount += extractNgramCounts(file, N)
K1 = 100
K1_most_common_Ngrams = totalNgramCount.most_common(K1)
K1_most_common_Ngrams_list = [x[0] for x in K1_most_common_Ngrams]

In [10]:
K1_most_common_Ngrams_list

[(0, 0),
 (255, 255),
 (204, 204),
 (2, 100),
 (1, 0),
 (0, 139),
 (131, 196),
 (2, 0),
 (68, 36),
 (139, 69),
 (0, 131),
 (255, 117),
 (133, 192),
 (255, 139),
 (254, 255),
 (46, 46),
 (139, 77),
 (141, 77),
 (255, 21),
 (7, 0),
 (69, 252),
 (8, 139),
 (76, 36),
 (0, 1),
 (4, 0),
 (4, 139),
 (137, 69),
 (141, 69),
 (0, 137),
 (0, 255),
 (255, 131),
 (51, 192),
 (80, 232),
 (255, 141),
 (85, 139),
 (8, 0),
 (3, 100),
 (0, 232),
 (15, 182),
 (0, 116),
 (139, 236),
 (64, 0),
 (80, 141),
 (15, 132),
 (12, 139),
 (100, 0),
 (253, 255),
 (255, 0),
 (84, 36),
 (73, 78),
 (65, 68),
 (0, 204),
 (80, 65),
 (68, 68),
 (78, 71),
 (68, 73),
 (16, 0),
 (198, 69),
 (192, 116),
 (199, 69),
 (80, 255),
 (204, 139),
 (2, 101),
 (4, 137),
 (139, 68),
 (116, 36),
 (3, 0),
 (0, 8),
 (139, 76),
 (106, 0),
 (101, 0),
 (196, 12),
 (100, 139),
 (139, 70),
 (64, 2),
 (36, 8),
 (0, 89),
 (69, 8),
 (117, 8),
 (196, 4),
 (86, 139),
 (95, 94),
 (139, 255),
 (32, 0),
 (0, 16),
 (131, 192),
 (0, 80),
 (0, 141),
 (19

In [21]:
# Extract N-gram features based on the frequency method
# Also, extracts some metadata such as DLL imports, 
# and PE Sections. We will combine these with
# our N-gram features to enrich the sample representation.
# This will take a few minutes to run.
# Some samples will generate errors such as 'not a PE file',
# 'DOS header not found', and 'invalid attribute'. These are OK.
importsCorpus_train = []
numSections_train = []
sectionNames_train = []
NgramFeaturesList_train = []
y_train = []
for i in range(len(samples_train)):
    file = samples_train[i]
    try:
        NGramFeatures = getNGramFeaturesFromSample(file, K1_most_common_Ngrams_list)
        pe = pefile.PE(file)
        imports = getImports(pe)
        nSections = len(pe.sections)
        secNames = getSectionNames(pe)
        importsCorpus_train.append(imports)
        numSections_train.append(nSections)
        sectionNames_train.append(secNames)
        NgramFeaturesList_train.append(NGramFeatures)
        y_train.append(labels_train[i])
    except Exception as e: 
        print(file+":")
        print(e)

Samples/Benign/BootExpCfg.exe:
'DOS Header magic not found.'
Samples/Malware/VirusShare_7a30183b105b4200fc201925aba4886c.exe:
'utf-8' codec can't decode byte 0xb8 in position 0: invalid start byte
Samples/Benign/evntwin.exe:
'DOS Header magic not found.'
Samples/Malware/VirusShare_1a89b7d4fb8ded72e1f8e81ee9352262.exe:
'utf-8' codec can't decode byte 0xb1 in position 0: invalid start byte
Samples/Benign/urlproxy.exe:
'Invalid NT Headers signature. Probably a NE file'
Samples/Benign/oisicon.exe:
'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'
Samples/Benign/fsynonym.exe:
'Invalid e_lfanew value, probably not a PE file'
Samples/Benign/malias.exe:
'Invalid e_lfanew value, probably not a PE file'
Samples/Malware/VirusShare_14f3035781bb698c37ad287483af569e.exe:
'utf-8' codec can't decode byte 0x8d in position 0: invalid start byte
Samples/Benign/SettingSyncHost.exe:
'DOS Header magic not found.'
Samples/Benign/pmsort.exe:
'Invalid e_lfanew value, probably not a PE file'
Samples/Benign/

In [22]:
importsCorpus_train

['ntoskrnl hal',
 'ws2_32 rpcrt4 kernel32 user32 advapi32 ole32 oleaut32',
 'advapi32 kernel32 user32 msvcrt oleaut32 ole32 cfgmgr32 setupapi scansetting',
 'ws2_32 rpcrt4 kernel32 user32 advapi32 ole32 oleaut32',
 'libgsl-19 kernel32 msvcrt',
 'ws2_32 rpcrt4 kernel32 user32 advapi32 ole32 oleaut32',
 'ntoskrnl hal',
 'msys-1 msys-intl-8 kernel32',
 'ntoskrnl hal',
 'ws2_32 rpcrt4 kernel32 user32 advapi32 ole32 oleaut32',
 'msys-1 msys-intl-8 kernel32',
 'mscoree',
 'advapi32 kernel32 user32 msvcrt comctl32 shell32 shlwapi api-ms-win-core-com-l1-1-1 api-ms-win-core-synch-l1-2-0 api-ms-win-core-errorhandling-l1-1-1 api-ms-win-core-processthreads-l1-1-2 api-ms-win-core-libraryloader-l1-2-0 api-ms-win-core-profile-l1-1-0 api-ms-win-core-sysinfo-l1-2-1 imm32',
 'msys-1 msys-intl-8 kernel32',
 'ws2_32 rpcrt4 kernel32 user32 advapi32 ole32 oleaut32',
 'advapi32 kernel32 msvcrt wldap32 srvcli logoncli sspicli netutils api-ms-win-core-libraryloader-l1-2-0 api-ms-win-core-synch-l1-2-0 api-ms-wi

In the following lines, we define a pipeline of sequential transforms (HashingVectorizer and TfidfTransformer) to extract N-gram featurs and construct feature vectors from the DLL imports and Section names extracted for each sample. 

In [23]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
imports_featurizer = Pipeline([('vect', HashingVectorizer(input='content', ngram_range=(1, 2))),('tfidf', TfidfTransformer(use_idf=True, )),])
section_names_featurizer = Pipeline([('vect', HashingVectorizer(input='content', ngram_range=(1, 2))),('tfidf', TfidfTransformer(use_idf=True, )),])
importsCorpus_train_transformed = imports_featurizer.fit_transform(importsCorpus_train)
sectionNames_train_transformed = section_names_featurizer.fit_transform(sectionNames_train)

In [24]:
# Combine the binary N-gram features with 
# the DLL imports and section names features to create
# vectorized training samples
from scipy.sparse import hstack, csr_matrix
X_train = hstack([NgramFeaturesList_train, importsCorpus_train_transformed,sectionNames_train_transformed, csr_matrix(numSections_train).transpose()])

In [25]:
# Convert X_train to CSR format
X_train_csr = X_train.tocsr()

# Extract the first row
first_row = X_train_csr[0]

# Extracting non-zero elements and their indices
nonzero_indices = first_row.indices
nonzero_values = first_row.data

# Printing nonzero indices and values
for index, value in zip(nonzero_indices, nonzero_values):
    print(f"Column Index: {index}, Value: {value}")


Column Index: 0, Value: 10414.0
Column Index: 1, Value: 1917.0
Column Index: 2, Value: 1972.0
Column Index: 3, Value: 9.0
Column Index: 4, Value: 1012.0
Column Index: 5, Value: 375.0
Column Index: 6, Value: 64.0
Column Index: 7, Value: 112.0
Column Index: 8, Value: 9.0
Column Index: 9, Value: 383.0
Column Index: 10, Value: 105.0
Column Index: 11, Value: 351.0
Column Index: 12, Value: 132.0
Column Index: 13, Value: 326.0
Column Index: 14, Value: 136.0
Column Index: 15, Value: 9.0
Column Index: 16, Value: 301.0
Column Index: 17, Value: 123.0
Column Index: 18, Value: 90.0
Column Index: 19, Value: 684.0
Column Index: 20, Value: 195.0
Column Index: 21, Value: 279.0
Column Index: 22, Value: 11.0
Column Index: 23, Value: 138.0
Column Index: 24, Value: 226.0
Column Index: 25, Value: 333.0
Column Index: 26, Value: 134.0
Column Index: 27, Value: 239.0
Column Index: 28, Value: 138.0
Column Index: 29, Value: 136.0
Column Index: 30, Value: 42.0
Column Index: 31, Value: 133.0
Column Index: 32, Value

In [26]:
#Train the Random Forest classifier
# This may take a few minutes.
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=1)
clf = clf.fit(X_train,y_train)

In [27]:
# Training accuracy
clf.score(X_train, y_train)

0.9978947368421053

In [28]:
# Generate feature vectors for the test samples
# This may take a few minutes
importsCorpus_test = []
numSections_test = []
sectionNames_test = []
NgramFeaturesList_test = []
y_test = []
for i in range(len(samples_test)):
    file = samples_test[i]
    try:
        NGramFeatures = getNGramFeaturesFromSample(file, K1_most_common_Ngrams_list)
        pe = pefile.PE(file)
        imports = getImports(pe)
        nSections = len(pe.sections)
        secNames = getSectionNames(pe)
        importsCorpus_test.append(imports)
        numSections_test.append(nSections)
        sectionNames_test.append(secNames)
        NgramFeaturesList_test.append(NGramFeatures)
        y_test.append(labels_test[i])
    except Exception as e: 
        print(file+":")
        print(e)

Samples/Benign/newmail.exe:
'Invalid e_lfanew value, probably not a PE file'
Samples/Benign/LockAppHost.exe:
'DOS Header magic not found.'
Samples/Benign/lc.exe:
'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'
Samples/Benign/.ipynb_checkpoints:
[Errno 21] Is a directory: 'Samples/Benign/.ipynb_checkpoints'
Samples/Benign/InstallUtil.exe:
'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'
Samples/Benign/Common.DBConnection64.exe:
'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'
Samples/Benign/RegAsm.exe:
'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'
Samples/Benign/aspnetca.exe:
'DOS Header magic not found.'


In [29]:
importsCorpus_test_transformed = imports_featurizer.transform(importsCorpus_test)
sectionNames_test_transformed = section_names_featurizer.transform(sectionNames_test)
X_test = hstack([NgramFeaturesList_test, importsCorpus_test_transformed,sectionNames_test_transformed, csr_matrix(numSections_test).transpose()])

In [30]:
X_test

<235x2097253 sparse matrix of type '<class 'numpy.float64'>'
	with 25646 stored elements in COOrdinate format>

In [31]:
clf.score(X_test, y_test)

0.9957446808510638

In [32]:
import joblib
#saving model
joblib.dump(clf, "model.joblib", protocol=2)

['model.joblib']

In [21]:
# load
loaded_model = joblib.load("my_sm_model.joblib")

In [22]:
loaded_model.score(X_train, y_train)

0.9978947368421053

In [23]:
importsCorpus_pred = []
numSections_pred = []
sectionNames_pred = []
NgramFeaturesList_pred = []

NGramFeatures_pred = getNGramFeaturesFromSample('ChromeSetup.exe', K1_most_common_Ngrams_list)
pe_pred = pefile.PE('ChromeSetup.exe')
imports_pred = getImports(pe_pred)
nSections_pred = len(pe_pred.sections)
secNames_pred = getSectionNames(pe_pred)
importsCorpus_pred.append(imports_pred)
numSections_pred.append(nSections_pred)
sectionNames_pred.append(secNames_pred)
NgramFeaturesList_pred.append(NGramFeatures_pred)

In [24]:
importsCorpus_pred_transformed = imports_featurizer.transform(importsCorpus_pred)
sectionNames_pred_transformed = section_names_featurizer.transform(sectionNames_pred)
X_pred = hstack([NgramFeaturesList_pred, importsCorpus_pred_transformed,sectionNames_pred_transformed, csr_matrix(numSections_pred).transpose()])

In [25]:
X_pred

<1x2097253 sparse matrix of type '<class 'numpy.float64'>'
	with 119 stored elements in COOrdinate format>

In [26]:
numSections_pred

[5]

In [27]:
{
        "NgramFeaturesList_pred": NgramFeaturesList_pred,
        "importsCorpus_pred_transformed": importsCorpus_pred,
        "sectionNames_pred_transformed": sectionNames_pred,
        "numSections_pred": numSections_pred
}

{'NgramFeaturesList_pred': [[24183,
   3382,
   304,
   17,
   923,
   636,
   358,
   275,
   128,
   635,
   358,
   613,
   389,
   384,
   448,
   12,
   380,
   170,
   307,
   122,
   224,
   203,
   51,
   338,
   521,
   111,
   395,
   215,
   175,
   419,
   264,
   397,
   287,
   106,
   487,
   236,
   16,
   277,
   459,
   594,
   469,
   241,
   155,
   163,
   158,
   230,
   215,
   443,
   80,
   46,
   44,
   216,
   68,
   42,
   36,
   48,
   161,
   29,
   240,
   145,
   139,
   52,
   20,
   75,
   99,
   33,
   224,
   161,
   38,
   226,
   729,
   139,
   27,
   168,
   19,
   68,
   269,
   271,
   236,
   33,
   197,
   207,
   337,
   1114,
   126,
   111,
   255,
   175,
   47,
   46,
   60,
   318,
   129,
   79,
   16,
   223,
   162,
   79,
   15,
   157]],
 'importsCorpus_pred_transformed': ['kernel32 shlwapi ole32 shell32 user32'],
 'sectionNames_pred_transformed': ['.text .rdata .data .rsrc .reloc'],
 'numSections_pred': [5]}

In [28]:
loaded_model.predict(X_pred)

array([1])

In [None]:
joblib.dump(imports_featurizer, 'imports_featurizer.pkl')
joblib.dump(section_names_featurizer, 'section_names_featurizer.pkl')