In [24]:
import csv
import json
import math
import os
from collections import Counter

import magic
import pefile


In [25]:
# Global Variables
# Directory that holds the raw sample files.
path = '/home/dev/data/pe-machine-learning-dataset/samples/'

# Name of every entry found in the samples directory.
files = os.listdir(path)

# Drop any samples.csv left in the working directory by an earlier run.
if os.path.isfile('samples.csv'):
    os.remove('samples.csv')

In [26]:
# In-memory record holding every feature collected for one sample.
class sample:
    """Feature record for a single sample.

    Only ``index`` and ``label`` are supplied at construction time; every
    other attribute starts at an empty/zero default.
    """

    def __init__(self, index, label):
        # Identity and label of the sample.
        self.index = index
        self.label = label

        # --- Whole-file features ---
        self.file_type = ""      # magic header description
        self.file_size = 0       # size of the file
        self.file_entropy = 0    # entropy of the file

        # --- PE info ---
        self.imports = []        # IAT
        self.exports = []        # EAT

        # Optional Header sizes.
        self.size_of_image = 0
        self.size_of_code = 0

        # Number of sections.
        self.number_of_sections = 0

        # Stack and heap reserve/commit sizes.
        self.size_of_stack_reserve = 0
        self.size_of_stack_commit = 0
        self.size_of_heap_reserve = 0
        self.size_of_heap_commit = 0

        # One entry per section.
        self.sections = []


In [27]:
# Mapping of sample index (as a string) -> sample object.
samples = {}

# Read the dataset's sample index (samples.csv).
# csv.reader is used instead of splitting on ',' so that quoted fields
# containing commas do not shift the column positions.
with open('/home/dev/data/pe-machine-learning-dataset/samples.csv', newline='') as f:
    for row in csv.reader(f):
        try:
            # Column 0 is the numeric index; a sample is labelled 1 when
            # column 6 mentions 'Blacklist', otherwise 0.
            s = sample(int(row[0].strip('"')), 1 if 'Blacklist' in row[6] else 0)
        except (ValueError, IndexError):
            # Header line (non-numeric index), blank line, or a row with
            # fewer than seven columns: skip it.
            continue
        samples[str(s.index)] = s


In [None]:
# Parse a PE file and store the extracted features on the sample object.
def _decode_pe_name(raw):
    """Return a symbol name as text; bytes are decoded as UTF-8 with replacement."""
    if isinstance(raw, bytes):
        return raw.decode('utf-8', errors='replace')
    return raw


def parse_pe(sample, filepath):
    """Fill ``sample`` with PE features read from ``filepath``.

    Parsing is best-effort: if the file cannot be parsed the function returns
    without touching ``sample``; if a later step fails, whatever was collected
    up to that point is kept.

    Args:
        sample: object exposing ``imports``, ``exports`` and ``sections`` lists
            plus the scalar size/count attributes assigned below.
        filepath: path of the file handed to ``pefile.PE``.

    Returns:
        None. Results are written onto ``sample``.
    """
    try:
        pe = pefile.PE(filepath)
    except Exception:
        # Not parsable as a PE (or unreadable): leave the defaults in place.
        return

    try:
        # IAT. The import directory is absent on some files, and entries
        # imported by ordinal have no name; skip those instead of aborting
        # the whole walk.
        try:
            for entry in getattr(pe, 'DIRECTORY_ENTRY_IMPORT', []):
                for imp in entry.imports:
                    if imp.name is not None:
                        sample.imports.append(_decode_pe_name(imp.name))
        except Exception:
            # Best-effort: keep the names gathered so far.
            pass

        # EAT. Names go to sample.exports (not imports) and are decoded the
        # same way as import names.
        try:
            export_dir = getattr(pe, 'DIRECTORY_ENTRY_EXPORT', None)
            if export_dir is not None:
                for symbol in export_dir.symbols:
                    if symbol.name is not None:
                        sample.exports.append(_decode_pe_name(symbol.name))
        except Exception:
            # Best-effort: keep the names gathered so far.
            pass

        sample.number_of_sections = pe.FILE_HEADER.NumberOfSections
        sample.size_of_code = pe.OPTIONAL_HEADER.SizeOfCode
        sample.size_of_image = pe.OPTIONAL_HEADER.SizeOfImage

        sample.size_of_stack_reserve = pe.OPTIONAL_HEADER.SizeOfStackReserve
        sample.size_of_stack_commit = pe.OPTIONAL_HEADER.SizeOfStackCommit
        sample.size_of_heap_reserve = pe.OPTIONAL_HEADER.SizeOfHeapReserve
        sample.size_of_heap_commit = pe.OPTIONAL_HEADER.SizeOfHeapCommit

        for section in pe.sections:
            # Fall back to the raw value when the name is not valid UTF-8
            # (or is not a bytes object).
            try:
                name = section.Name.decode('utf-8')
            except (UnicodeDecodeError, AttributeError):
                name = section.Name

            flags = section.Characteristics
            sample.sections.append([
                name,
                section.SizeOfRawData,
                section.Misc_VirtualSize,
                1 if flags & 0x00000020 > 0 else 0,  # Contains code
                1 if flags & 0x20000000 > 0 else 0,  # Executable
                1 if flags & 0x80000000 > 0 else 0,  # Writable
            ])
    except Exception:
        # Malformed headers/sections: keep what was extracted and stop.
        return
    finally:
        # Release the file handle / mapping held by the parser, if it
        # exposes a close() method.
        close = getattr(pe, 'close', None)
        if callable(close):
            close()

In [None]:
# Calculate the Shannon Entropy of a File
def entropy(filename):
    """Return the Shannon entropy of a file's bytes, in bits per byte.

    Args:
        filename: path of the file to read (opened in binary mode).

    Returns:
        A float between 0.0 (every byte identical, or an empty file) and 8.0
        (all 256 byte values equally frequent).

    Raises:
        OSError: if the file cannot be opened or read.
    """
    with open(filename, 'rb') as f:
        data = f.read()

    total = len(data)
    if total == 0:
        # No bytes means no information; also avoids dividing by zero.
        return 0.0

    # Single pass over the data instead of one scan per possible byte value.
    counts = Counter(data)

    ent = 0.0
    # Accumulate in ascending byte-value order so the floating-point sum is
    # deterministic for a given file.
    for value in sorted(counts):
        freq = counts[value] / total
        ent = ent + freq * math.log(freq, 2)
    return -ent

In [None]:
# Column order shared by the header row and every data row; each name is
# also the attribute read from the sample object.
columns = [
    'index',
    'label',
    'file_type',
    'file_size',
    'file_entropy',
    'imports',
    'exports',
    'size_of_image',
    'size_of_code',
    'size_of_stack_reserve',
    'size_of_stack_commit',
    'size_of_heap_reserve',
    'size_of_heap_commit',
    'number_of_sections',
    'sections',
]

# Directory entries that have no matching record in `samples`.
skipped = []

with open('samples.csv', 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    writer.writerow(columns)

    # A distinct loop name keeps the open file handle from being shadowed.
    for name in files:
        record = samples.get(name)
        if record is None:
            # No metadata for this file: note it and carry on rather than
            # aborting the whole export with a KeyError.
            skipped.append(name)
            continue

        filepath = os.path.join(path, name)

        # parse_pe appends to these lists, so reset them first; otherwise
        # re-running this cell would duplicate every entry.
        record.imports = []
        record.exports = []
        record.sections = []

        record.file_type = magic.from_file(filepath)
        record.file_size = os.stat(filepath).st_size

        parse_pe(record, filepath)
        record.file_entropy = entropy(filepath)

        writer.writerow([getattr(record, column) for column in columns])

if skipped:
    print('Skipped {} file(s) with no entry in the sample index'.format(len(skipped)))