In [1]:
import os
import cv2
import numpy as np
from tqdm import tqdm

# Input tree (one subfolder of .bytes files per family) and the output tree
# that will receive the rendered images; '~' is expanded to the user's home.
organized_dir, organized_images_dir = (
    os.path.expanduser(raw_path)
    for raw_path in (
        '~/Downloads/BIG_2015/organized',
        '~/Downloads/BIG_2015/organized_images',
    )
)

# Make sure the output tree exists (no error if it is already there)
os.makedirs(organized_images_dir, exist_ok=True)

In [2]:
def bytes_file_to_image(bytes_file_path, output_image_path, width=256, target_size=(64, 64)):
    """
    Reads a .bytes file, converts its hex content to a grayscale image, and saves it.

    Each input line is expected to look like "00401000 55 8B EC 6A FF ...":
    an address token followed by hex byte tokens. The address is discarded,
    "??" tokens are stored as 0x00, and the remaining bytes are laid out row
    by row into an image `width` pixels wide. Bytes that do not fill a
    complete final row are dropped.

    :param bytes_file_path: Path to the input .bytes file
    :param output_image_path: Path to the output image file (e.g., .png)
    :param width: The fixed width of the image built from the raw bytes;
                  must be a positive integer
    :param target_size: (width, height) the image is resized to before it is
                        saved, or None to save it at its native size.
                        Defaults to (64, 64), e.g. for DCGAN input
    :raises ValueError: if `width` is not positive, a token is not a hex value
                        in 0..255, or the file holds fewer than `width` bytes
    :raises OSError: if OpenCV reports that the image could not be written
    """
    if width < 1:
        raise ValueError(f"width must be a positive integer, got {width!r}")

    # Stream the file line by line into a bytearray instead of materialising
    # every line and one Python int per byte; the dumps can be many megabytes.
    raw = bytearray()
    with open(bytes_file_path, 'r') as f:
        for line in f:
            # Example line: "00401000 55 8B EC 6A FF ..."
            # The first token is the address, skip it (blank lines yield nothing)
            tokens = line.split()[1:]
            # "??" marks an unknown byte and is treated as 0x00
            raw.extend(0 if tok == "??" else int(tok, 16) for tok in tokens)

    # height = total_bytes // width (integer division)
    height = len(raw) // width
    if height == 0:
        # An empty (0, width) array would otherwise fail deep inside OpenCV
        # with an assertion message that does not name the offending file.
        raise ValueError(
            f"{bytes_file_path} holds {len(raw)} byte(s), "
            f"fewer than one full row of width {width}"
        )

    # View the first height*width bytes as a 2-D uint8 image; leftover bytes
    # past the last complete row are ignored.
    image = np.frombuffer(raw, dtype=np.uint8, count=height * width).reshape((height, width))

    if target_size is not None:
        image = cv2.resize(image, tuple(target_size))

    # cv2.imwrite signals failure through its return value rather than an
    # exception, so check it explicitly instead of failing silently.
    if not cv2.imwrite(output_image_path, image):
        raise OSError(f"cv2.imwrite failed to write {output_image_path}")


In [3]:
def convert_folder_to_images(folder_path, output_folder, width=256):
    """
    Render every .bytes file found directly inside 'folder_path' as a .png
    of the same base name inside 'output_folder'.

    Files whose target image already exists are left alone, and a failure on
    one file is printed and does not stop the remaining conversions.

    :param folder_path: Path to the folder containing .bytes files
    :param output_folder: Path to the folder where images will be saved
                          (created if missing)
    :param width: Image width passed through to bytes_file_to_image
    """
    os.makedirs(output_folder, exist_ok=True)

    # Keep only the directory entries that carry the .bytes suffix
    candidates = list(filter(lambda entry: entry.endswith('.bytes'), os.listdir(folder_path)))
    progress_label = f"Converting {os.path.basename(folder_path)}"

    for entry in tqdm(candidates, desc=progress_label):
        source_file = os.path.join(folder_path, entry)

        # Same base name as the input, with the extension swapped for .png
        stem, _ext = os.path.splitext(entry)
        target_file = os.path.join(output_folder, f"{stem}.png")

        # Only convert files that have no image yet
        if not os.path.exists(target_file):
            try:
                bytes_file_to_image(source_file, target_file, width=width)
            except Exception as err:
                print(f"Error converting {source_file}: {err}")


In [4]:
# Every immediate subdirectory of organized_dir holds one malware family
family_folders = list(filter(
    lambda entry: os.path.isdir(os.path.join(organized_dir, entry)),
    os.listdir(organized_dir),
))

for family_name in family_folders:
    # Mirror the family's folder name under both the input and output roots
    input_folder, output_folder = (
        os.path.join(root, family_name)
        for root in (organized_dir, organized_images_dir)
    )

    # Render all .bytes files of this family as images
    convert_folder_to_images(input_folder, output_folder, width=256)

print("All .bytes files have been converted to images.")


Converting Backdoor_Kelihos_ver3: 100%|█████| 2942/2942 [10:20<00:00,  4.74it/s]
Converting Obfuscated_Malware: 100%|████████| 1228/1228 [00:36<00:00, 34.03it/s]
Converting Backdoor_Kelihos_ver1: 100%|███████| 398/398 [00:44<00:00,  8.99it/s]
Converting Trojan: 100%|██████████████████████| 475/475 [00:19<00:00, 24.06it/s]
Converting Worm: 100%|██████████████████████| 1541/1541 [01:20<00:00, 19.26it/s]
Converting Trojan_Downloader: 100%|███████████| 751/751 [00:46<00:00, 16.07it/s]
Converting Backdoor_Simda: 100%|████████████████| 42/42 [00:04<00:00,  8.63it/s]
Converting Backdoor_Gatak: 100%|████████████| 1013/1013 [01:25<00:00, 11.91it/s]
Converting Adware: 100%|████████████████████| 2478/2478 [07:21<00:00,  5.61it/s]

All .bytes files have been converted to images.



