In [2]:
import os
import subprocess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pefile
import pickle
from pathlib import Path
from tqdm.notebook import tqdm
import hashlib
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
plt.style.use('ggplot')

In [3]:
# Root directory of the samples; scanned recursively by deobfuscate() and
# extract_features() in the cells below.
MALWARE_DIR =  "data/malware" 
# NOTE(review): EXTRACTED_DIR and PROCESSED_DIR are not referenced in the
# visible cells; presumably output locations for later stages — TODO confirm.
EXTRACTED_DIR = "data/extracted" 
PROCESSED_DIR = "data/processed"

Unpacking/Deobfuscation

In [5]:
# Module-level tally of unpacked files; printed at the end of this cell.
count = 0

def is_upx_packed(file_path):
    """Report whether the file's raw bytes contain the ``UPX!`` marker.

    The whole file is read into memory and searched for the marker anywhere
    in its contents.

    Args:
        file_path: Path (or path string) of the file to inspect.

    Returns:
        True if the marker is present, False otherwise. Any error raised
        while opening or reading is printed and also reported as False.
    """
    marker = b'UPX!'
    try:
        with open(file_path, 'rb') as handle:
            found = marker in handle.read()
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return False
    return found
    
def unpack(file_path):
    """Run ``upx -d`` on ``file_path``; the file is unpacked in place.

    Args:
        file_path: Path (or path string) of the file to unpack.

    Returns:
        ``file_path`` when the command exits with status 0, otherwise None.
        A non-zero exit prints the command's stderr; an exception while
        launching the command is printed and also yields None.
    """
    try:
        # No output path is given: upx overwrites the input file.
        proc = subprocess.run(
            ["upx", "-d", str(file_path)],
            capture_output=True,
            text=True,
        )
    except Exception as e:
        print(f"Error unpacking {file_path}: {e}")
        return None

    if proc.returncode == 0:
        return file_path

    print(f"Error unpacking {file_path}: {proc.stderr}")
    return None
    

def deobfuscate(malware_dir):
    """Unpack every UPX-packed file found under ``malware_dir``, in place.

    Walks the directory recursively, tests each regular file with
    ``is_upx_packed`` and hands the matches to ``unpack``.

    Args:
        malware_dir: Root directory to scan.

    Returns:
        Number of files successfully unpacked by this call. The module-level
        ``count`` is incremented by the same amount.
    """
    # Without this declaration the augmented assignment below makes `count`
    # a function local, and the first successful unpack raises
    # UnboundLocalError instead of updating the notebook-level tally.
    global count

    unpacked = 0
    for file_path in Path(malware_dir).glob('**/*'):
        if file_path.is_file() and is_upx_packed(file_path):
            if unpack(file_path):
                count += 1
                unpacked += 1
    return unpacked

    
# Run the unpacking pass over MALWARE_DIR, then report the module-level `count`.
deobfuscate(MALWARE_DIR)
print(f"Unpacked {count} files.")

Unpacked 0 files.


Data Extraction

In [1]:
def extract_file_metadata(file_path):
    """Compute the identifier and size features for one file.

    Args:
        file_path: ``pathlib.Path`` of the file.

    Returns:
        Dict with ``id`` (hex MD5 digest of the file's bytes) and
        ``file_size`` (size in bytes as reported by ``stat``).
    """
    digest = hashlib.md5(file_path.read_bytes()).hexdigest()
    size_on_disk = file_path.stat().st_size
    return {'id': digest, 'file_size': size_on_disk}

def extract_file_header(pe):
    """Copy selected ``pe.FILE_HEADER`` fields into a feature dict.

    Args:
        pe: Object exposing a ``FILE_HEADER`` attribute with ``Machine``,
            ``NumberOfSections``, ``TimeDateStamp`` and ``Characteristics``.

    Returns:
        Dict with keys ``machine``, ``number_of_sections``, ``timestamp``
        and ``characteristics``.
    """
    header = pe.FILE_HEADER
    # (feature name, header attribute) pairs, in output order.
    field_map = (
        ('machine', 'Machine'),
        ('number_of_sections', 'NumberOfSections'),
        ('timestamp', 'TimeDateStamp'),
        ('characteristics', 'Characteristics'),
    )
    return {feature: getattr(header, attr) for feature, attr in field_map}

def extract_optional_header(pe):
    """Copy selected ``pe.OPTIONAL_HEADER`` fields into a feature dict.

    Args:
        pe: Object that may expose an ``OPTIONAL_HEADER`` attribute.

    Returns:
        An empty dict when ``pe`` has no ``OPTIONAL_HEADER``; otherwise a dict
        with subsystem, DLL characteristics, code/data sizes, entry point and
        ``is_64bit`` (1 when ``Magic`` equals 0x20b, else 0).
    """
    if hasattr(pe, 'OPTIONAL_HEADER'):
        optional = pe.OPTIONAL_HEADER
        features = {
            'subsystem': optional.Subsystem,
            'dll_characteristics': optional.DllCharacteristics,
            'size_of_code': optional.SizeOfCode,
            'size_of_initialized_data': optional.SizeOfInitializedData,
            'size_of_uninitialized_data': optional.SizeOfUninitializedData,
            'entrypoint': optional.AddressOfEntryPoint,
        }
        if optional.Magic == 0x20b:
            features['is_64bit'] = 1
        else:
            features['is_64bit'] = 0
        return features
    return {}

def extract_section_details(pe, entropy_threshold=7.0):
    """Collect per-section names, raw sizes and entropies.

    Args:
        pe: Object exposing ``sections``; each section provides ``Name``
            (bytes), ``SizeOfRawData`` and ``get_entropy()``.
        entropy_threshold: A section whose entropy is strictly greater than
            this value sets ``has_high_entropy_section``. Defaults to 7.0.

    Returns:
        Dict with ``section_names``, ``section_entropies``, ``section_sizes``
        (parallel lists in section order), ``has_high_entropy_section`` (bool)
        and ``avg_section_entropy`` (mean of the entropies, 0 when there are
        no sections).
    """
    section_names = []
    section_entropies = []
    section_sizes = []
    has_high_entropy_section = False

    for section in pe.sections:
        section_names.append(section.Name.decode('utf-8', 'ignore').strip('\x00'))
        section_sizes.append(section.SizeOfRawData)
        try:
            entropy = section.get_entropy()
            if entropy > entropy_threshold:
                has_high_entropy_section = True
        except Exception:
            # Best effort: a section whose entropy cannot be computed counts
            # as 0. Only Exception is caught so that KeyboardInterrupt and
            # SystemExit still stop a long extraction run.
            entropy = 0
        section_entropies.append(entropy)

    return {
        'section_names': section_names,
        'section_entropies': section_entropies,
        'section_sizes': section_sizes,
        'has_high_entropy_section': has_high_entropy_section,
        'avg_section_entropy': np.mean(section_entropies) if section_entropies else 0
    }

def extract_imports(pe):
    """Collect imported DLLs and function names from the import directory.

    Args:
        pe: Object that may expose ``DIRECTORY_ENTRY_IMPORT``, an iterable of
            entries with ``dll`` (bytes) and ``imports``; each import has
            ``name`` (bytes or falsy) and ``ordinal``.

    Returns:
        Dict with ``imports`` (DLL name -> list of function names, with
        ``ordinal_<n>`` for imports without a name), ``num_imported_dlls``
        and ``num_imported_functions``. All three are empty/zero when the
        object has no import directory.
    """
    imports = {}
    if not hasattr(pe, 'DIRECTORY_ENTRY_IMPORT'):
        return {'imports': imports, 'num_imported_dlls': 0, 'num_imported_functions': 0}

    for entry in pe.DIRECTORY_ENTRY_IMPORT:
        dll_name = entry.dll.decode('utf-8', 'ignore')
        # The same DLL can appear in more than one import descriptor; extend
        # its existing list instead of replacing it so no functions are lost.
        functions = imports.setdefault(dll_name, [])
        try:
            for imp in entry.imports:
                if imp.name:
                    functions.append(imp.name.decode('utf-8', 'ignore'))
                else:
                    functions.append(f"ordinal_{imp.ordinal}")
        except Exception as e:
            # Best effort: keep whatever was read for this DLL, but report
            # the problem instead of hiding it.
            print(f"Error reading imports from {dll_name}: {e}")

    return {
        'imports': imports,
        'num_imported_dlls': len(imports),
        'num_imported_functions': sum(len(funcs) for funcs in imports.values())
    }

def extract_security_features(pe):
    """Flag the presence of certificate, debug, TLS and load-config data.

    Args:
        pe: Parsed PE object. The debug/TLS/load-config flags test for the
            corresponding ``DIRECTORY_ENTRY_*`` attributes. The certificate
            flag also inspects ``OPTIONAL_HEADER.DATA_DIRECTORY``.

    Returns:
        Dict with ``has_certificate``, ``has_debug``, ``has_tls`` and
        ``has_load_config``, each 0 or 1.
    """
    # NOTE(review): pefile does not appear to create a DIRECTORY_ENTRY_SECURITY
    # attribute, so the attribute test alone always yields 0 — confirm against
    # the installed pefile version. The Certificate Table is slot 4 of the
    # optional header's data directory array; a non-zero offset and size
    # there means the file carries an attribute certificate table.
    security_index = 4
    has_certificate = hasattr(pe, 'DIRECTORY_ENTRY_SECURITY')
    if not has_certificate:
        optional_header = getattr(pe, 'OPTIONAL_HEADER', None)
        data_directory = getattr(optional_header, 'DATA_DIRECTORY', None) or []
        if len(data_directory) > security_index:
            entry = data_directory[security_index]
            has_certificate = bool(entry.VirtualAddress) and bool(entry.Size)

    return {
        'has_certificate': 1 if has_certificate else 0,
        'has_debug': 1 if hasattr(pe, 'DIRECTORY_ENTRY_DEBUG') else 0,
        'has_tls': 1 if hasattr(pe, 'DIRECTORY_ENTRY_TLS') else 0,
        'has_load_config': 1 if hasattr(pe, 'DIRECTORY_ENTRY_LOAD_CONFIG') else 0
    }

def extract_features(malware_dir):
    """Build a feature table for every parseable PE file under ``malware_dir``.

    Each regular file found recursively is hashed, parsed with ``pefile`` and
    passed through the header, section, import and security extractors. A file
    that cannot be read or parsed is reported and skipped.

    Args:
        malware_dir: Root directory to scan.

    Returns:
        ``pandas.DataFrame`` indexed by ``id`` (MD5 of the file contents), one
        row per successfully processed file. When no file could be processed
        the frame is empty but still has an index named ``id``.
    """
    all_features = []

    for file_path in Path(malware_dir).glob('**/*'):
        if not file_path.is_file():
            continue

        try:
            # Inside the try so that one unreadable file is skipped rather
            # than aborting the whole run.
            features = extract_file_metadata(file_path)
            pe = pefile.PE(str(file_path))
            try:
                features.update(extract_file_header(pe))
                features.update(extract_optional_header(pe))
                features.update(extract_section_details(pe))
                features.update(extract_imports(pe))
                features.update(extract_security_features(pe))
            finally:
                # Release the data pefile keeps open for the parsed file.
                pe.close()
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            continue

        all_features.append(features)

    if not all_features:
        # An empty frame has no 'id' column, so set_index('id') would raise
        # KeyError; return an empty frame with the same index name instead.
        return pd.DataFrame(index=pd.Index([], name='id'))

    return pd.DataFrame(all_features).set_index('id')

# Requires the configuration cell that defines MALWARE_DIR to have been run
# first in this kernel; otherwise this line raises NameError.
data = extract_features(MALWARE_DIR)

NameError: name 'MALWARE_DIR' is not defined

In [24]:
# Preview the first rows of the extracted feature table.
data.head()

Unnamed: 0_level_0,file_size,machine,number_of_sections,timestamp,characteristics,subsystem,dll_characteristics,size_of_code,size_of_initialized_data,size_of_uninitialized_data,...,section_sizes,has_high_entropy_section,avg_section_entropy,imports,num_imported_dlls,num_imported_functions,has_certificate,has_debug,has_tls,has_load_config
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
839583bde18d844c33397792f5165ba4,5632,332,3,1242321160,271,2,0,4096,4096,20480,...,"[0, 4096, 512]",True,3.506278,"{'KERNEL32.DLL': ['LoadLibraryA', 'ExitProcess...",5,8,0,0,0,0
d07399600be38dc86a9ee3e15ee3ba07,5632,332,3,1242321160,271,2,0,4096,4096,20480,...,"[0, 4096, 512]",True,3.508658,"{'KERNEL32.DLL': ['LoadLibraryA', 'ExitProcess...",5,8,0,0,0,0
add869e77de67203223d83da817522b3,15360,332,3,1311923431,271,2,0,9728,4010496,0,...,"[9728, 3072, 1536]",False,4.836347,"{'KERNEL32.dll': ['CreateFileA', 'LocalAlloc',...",7,85,0,0,0,0
5816b33615c9fae97302b79609afa71f,5632,332,3,1242321160,271,2,0,4096,4096,20480,...,"[0, 4096, 512]",True,3.508873,"{'KERNEL32.DLL': ['LoadLibraryA', 'ExitProcess...",5,8,0,0,0,0
b935e7000e4e2314b5fcfb102c274346,5632,332,3,1242321160,271,2,0,4096,4096,20480,...,"[0, 4096, 512]",True,3.506825,"{'KERNEL32.DLL': ['LoadLibraryA', 'ExitProcess...",5,8,0,0,0,0
