## **Functionality:**
- Collect section-level information for PE samples using the 'pefile' module.
- Store the gathered data as a pickle file (one per sample).

### **Input:**
- is_benign: Type of samples being processed [ True / False ]
- src_paths: List of paths to PE samples.
  
### **Output:**
- 'pickle_files' folder: Contains one pickle file per sample, in the data format below. [ Each pickle file's name is the MD5 hash of the corresponding sample. ]
            {
                name,
                md5,
                sha1,
                sha256,
                size_byte,
                benign,
                num_of_sections,
                section_info: {
                    <section_name> : {
                         section_data : [int_array],
                         section_size_byte : <int>,
                         section_bounds : {
                             start_offset: <int>,
                             end_offset: <int>
                         }
                    }
                }
            }
    
### **Excluded Tags:** 
- entry_date, superficial, disasm, image_256w, folder

In [311]:
import os
import traceback
import pefile
import hashlib
import pickle
import time

In [312]:
def _describe_section(section):
    """Return ``(name, info)`` for a single pefile section object."""
    # The name is a raw, NUL-padded byte field and nothing guarantees it is
    # valid UTF-8, so undecodable bytes are replaced instead of letting one
    # odd section name reject the whole sample.
    name = section.Name.rstrip(b'\x00').decode("utf-8", errors="replace").strip()
    start = section.PointerToRawData
    size = section.SizeOfRawData
    info = {
        "section_data": list(section.get_data()),
        "section_size_byte": size,
        "section_bounds": {
            "start_offset": start,
            # Inclusive offset of the last raw byte of the section.
            "end_offset": start + size - 1,
        },
    }
    return name, info


def _collect_file_data(src_file, file_name, is_benign):
    """Parse one PE sample and return its file-level and section-level record."""
    pe = pefile.PE(src_file)
    try:
        with open(src_file, 'rb') as fhandle:
            file_byte_data = fhandle.read()

        file_data = {
            "name": file_name,
            "md5": hashlib.md5(file_byte_data).hexdigest(),
            "sha1": hashlib.sha1(file_byte_data).hexdigest(),
            "sha256": hashlib.sha256(file_byte_data).hexdigest(),
            "size_byte": os.stat(src_file).st_size,
            "benign": is_benign,
            "num_of_sections": pe.FILE_HEADER.NumberOfSections,
            "section_info": {},
        }

        # NOTE: sections sharing the same name overwrite each other here, so
        # len(section_info) can be smaller than num_of_sections + 1.
        for section in pe.sections:
            section_name, section_data = _describe_section(section)
            file_data["section_info"][section_name] = section_data

        # The PE header is not a real section; it is stored as a pseudo-section.
        header = pe.header
        file_data["section_info"][".header"] = {
            "section_data": list(header),
            "section_size_byte": len(header),
            "section_bounds": {
                "start_offset": 0,
                # Inclusive, to match the convention used for real sections.
                "end_offset": len(header) - 1,
            },
        }
        return file_data
    finally:
        # Release the memory-mapped sample so handles do not pile up
        # while walking large sample folders.
        pe.close()


def get_section_data(path, is_benign):
    """Collect section-level data for every PE sample found under *path*.

    Walks *path* recursively. Each file that parses as a PE is written as
    ``<md5>.pkl`` into a ``pickle_files`` folder next to the sample; the folder
    is created on demand. Directories named ``pickle_files`` are not scanned.

    Args:
        path: Root folder containing the samples.
        is_benign: Label stored with every sample (True = benign,
            False = malware).

    Returns:
        Number of files that could not be processed. Failures are reported
        on stdout and do not stop the run.
    """
    unprocessed = 0

    for src_dir, _dirs, files in os.walk(path):

        # Skip previously generated output (exact folder name only).
        if os.path.basename(src_dir) == "pickle_files":
            continue

        out_dir = os.path.join(src_dir, "pickle_files")

        for file_ in files:
            src_file = os.path.join(src_dir, file_)
            try:
                file_data = _collect_file_data(src_file, file_, is_benign)

                # Sub-directories reached by the walk have no output folder yet.
                os.makedirs(out_dir, exist_ok=True)
                with open(os.path.join(out_dir, file_data["md5"] + ".pkl"), "wb") as pkl:
                    pickle.dump(file_data, pkl)

            except Exception as e:
                # Best-effort batch job: report the failure and move on.
                unprocessed += 1
                print("parse failed . . . [ Unprocessed #:", str(unprocessed), "] [ ERROR: " + str(e) + " ] [ FILE: ", src_file, "] ")

    return unprocessed



#### Initiate processing by setting 'is_benign' and 'src_paths'

In [313]:
# Monotonic timer: elapsed time is unaffected by system clock adjustments.
start_time = time.perf_counter()

# Type of samples to be processed
is_benign = True  # False for malware

# List of sample folders as source paths
src_paths = ["D:\\08_Dataset\\Huawei_DS\\data\\test"]

total_unprocessed = 0
for path in src_paths:

    # Create a folder within the source path to store pickle files.
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    destination_directory = os.path.join(path, "pickle_files")
    os.makedirs(destination_directory, exist_ok=True)

    # Collect section-level details for all PE samples in the given folder
    total_unprocessed += get_section_data(path, is_benign)

end_time = time.perf_counter()
print("\nData collection completed.\nTotal unprocessed files in all given paths: ", total_unprocessed)
print("Time elapsed: {0:.3f}".format((end_time - start_time)/60), "minute(s)")

parse failed . . . [ Unprocessed #: 1 ] [ ERROR: 'DOS Header magic not found.' ] [ FILE:  D:\08_Dataset\Huawei_DS\data\test\acsr.exe ] 

Data collection completed.
Total unprocessed files in all given paths:  1
Time elapsed: 0.004 minute(s)
