In [1]:
import os
import json
import magic
import pefile
import csv
import math
from time import sleep

from queue import Queue
from threading import Thread

In [2]:
# Global Variables
path = '/home/dev/data/pe-machine-learning-dataset/samples/'
files = list(os.listdir(path))
if os.path.isfile('samples.csv'):
    os.remove('samples.csv')

numThreads = 10
sampleQueue = Queue()
writeQueue = Queue()

In [3]:
# Read Sample.csv
with open('/home/dev/data/pe-machine-learning-dataset/samples.csv') as f:
    for line in f:
        try:
            # Split the csv, and keep the filename (index) and a binary label
            l = line.split(',')
            #s = sample(int(l[0].strip('"')), 1 if 'Blacklist' in l[6] else 0)
            #samples[str(s.index)] = s
            sampleQueue.put([l[0].strip('"'), 1 if 'Blacklist' in l[6] else 0])
        except:
            continue # First like breaks


sampleQueue.qsize()


201550

In [4]:
# Define what each thread will do. 
def ThreadJob(sampleQueue, writeQueue):
    while not sampleQueue.empty():
        sample = sampleQueue.get()
        imports, \
        exports, \
        NumberOfSections, \
        SizeOfCode, \
        SizeOfImage, \
        SizeOfStackReserve, \
        SizeOfStackCommit, \
        SizeOfHeapReserve, \
        SizeOfHeapCommit, \
        sections = parse_pe(path+sample[0])

        writeQueue.put([
            int(sample[0]),
            sample[1],
            magic.from_file(path+sample[0]),
            os.stat(path+sample[0]).st_size,
            entropy(path+sample[0]),
            imports,
            exports,
            SizeOfCode,
            SizeOfImage,
            SizeOfStackReserve,
            SizeOfStackCommit,
            SizeOfHeapReserve,
            SizeOfHeapCommit,
            NumberOfSections,
            sections
        ])

In [5]:
# Parse PE function and add to the 
# sample object. 
def parse_pe(filepath):
    try:
        pe =  pefile.PE(filepath)
        imports = []
        exports = []
        try:
            for entry in pe.DIRECTORY_ENTRY_IMPORT:
                for imp in entry.imports:
                    imports.append(imp.name.decode('utf-8'))
        except: 
            pass    
        try:
            for entry in pe.DIRECTORY_ENTRY_EXPORT.symbols:
                imports.append(entry.name)
        except:
            pass


        sections = []
        for section in pe.sections:
            try:
                sections.append([
                    section.Name.decode('utf-8'),
                    section.SizeOfRawData,
                    section.Misc_VirtualSize,
                    1 if section.Characteristics & 0x00000020 > 0 else 0, # Contains code
                    1 if section.Characteristics & 0x20000000 > 0 else 0, # Executable
                    1 if section.Characteristics & 0x80000000 > 0 else 0, # Writable
                ])
            except:
                sections.append([
                    section.Name,
                    section.SizeOfRawData,
                    section.Misc_VirtualSize,
                    1 if section.Characteristics & 0x00000020 > 0 else 0, # Contains code
                    1 if section.Characteristics & 0x20000000 > 0 else 0, # Executable
                    1 if section.Characteristics & 0x80000000 > 0 else 0, # Writable
                ])

        return \
            imports, \
            exports, \
            pe.FILE_HEADER.NumberOfSections, \
            pe.OPTIONAL_HEADER.SizeOfCode, \
            pe.OPTIONAL_HEADER.SizeOfImage, \
            pe.OPTIONAL_HEADER.SizeOfStackReserve, \
            pe.OPTIONAL_HEADER.SizeOfStackCommit, \
            pe.OPTIONAL_HEADER.SizeOfHeapReserve, \
            pe.OPTIONAL_HEADER.SizeOfHeapCommit, \
            sections
    except:
        return    

In [6]:
# Calculate the Shannon Entropy of a File
def entropy(filename):
    with open(filename, 'rb') as f:
        byteArr = list(f.read())
    fileSize = len(byteArr)
    freqList = []
    for b in range(256):
        ctr = 0
        for byte in byteArr:
            if byte == b:
                ctr += 1
        freqList.append(float(ctr) / fileSize)
    
    ent = 0.0
    for freq in freqList:
        if freq > 0:
            ent =  ent + freq * math.log(freq,2)
    return -ent

In [7]:
for i in range(numThreads):
    worker = Thread(target=ThreadJob, args=(sampleQueue, writeQueue))
    worker.setDaemon(True)
    worker.start()

with open('samples.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow([
        'index',
        'label',
        'file_type',
        'file_size',
        'file_entropy',
        'imports',
        'exports',
        'size_of_image',
        'size_of_code',
        'size_of_stack_reserve',
        'size_of_stack_commit', 
        'size_of_heap_reserve',
        'size_of_heap_commit',
        'number_of_sections',
        'sections',        
    ])

    while True:
        if writeQueue.empty():
            continue

        writer.writerow(writeQueue.get())

Exception in thread Thread-8:
Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_224702/3117644514.py", line 5, in ThreadJob
TypeError: cannot unpack non-iterable NoneType object
