In this exercise, we use the pefile module to analyze PE malware samples in the "Mediyes" folder. The goal is to extract the names of the PE sections, as well as the names of imported DLLs from each sample. 

In [2]:
# To install pefile, uncomment and execute the following line:
!pip install pefile

Defaulting to user installation because normal site-packages is not writeable
Collecting pefile
  Downloading pefile-2023.2.7-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.8/71.8 KB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pefile
Successfully installed pefile-2023.2.7


In [6]:
from os import listdir
from os.path import isfile, join
directories = ["Mediyes"]
import pefile

In [4]:
# takes input such as [b'ADVAPI32.dll', b'KERNEL32.dll', b'msvcrt.dll']
# and converts case to lower and removes .dll 
def preprocessImports(listOfDLLs):
    processedListOfDLLs = []
    return [x.decode().split(".")[0].lower() for x in listOfDLLs]

def getImports(pe):
    listOfImports = []
    for entry in pe.DIRECTORY_ENTRY_IMPORT:
        listOfImports.append(entry.dll)
    return preprocessImports(listOfImports)

def getSectionNames(pe):
    listOfSectionNames = []
    for eachSection in pe.sections:
        refined_name = eachSection.Name.decode().replace('\x00','').lower()
        listOfSectionNames.append(refined_name)
    return listOfSectionNames

The following block may take a couple of minutes to run. It is OK to see 'utf-8' decoding error messages for some of the files.

In [5]:
importsCorpus = []
numSections = []
sectionNames = []
print (directories)
for datasetPath in directories:
    samples = [f for f in listdir(datasetPath) if isfile(join(datasetPath,f))]
    for file in samples:
        filePath = datasetPath+"/"+file
        try:
            pe = pefile.PE(filePath)
            imports = getImports(pe)
            nSections = len(pe.sections)
            secNames = getSectionNames(pe)
            importsCorpus.append(imports)
            numSections.append(nSections)
            sectionNames.append(secNames)
                  
        except Exception as e: 
            print(e)
            print("Unable to obtain imports from "+filePath)

['Mediyes']
'utf-8' codec can't decode byte 0xb1 in position 0: invalid start byte
Unable to obtain imports from Mediyes/VirusShare_1a89b7d4fb8ded72e1f8e81ee9352262.exe
'utf-8' codec can't decode byte 0xb8 in position 0: invalid start byte
Unable to obtain imports from Mediyes/VirusShare_7a30183b105b4200fc201925aba4886c.exe
'utf-8' codec can't decode byte 0x8d in position 0: invalid start byte
Unable to obtain imports from Mediyes/VirusShare_14f3035781bb698c37ad287483af569e.exe


In [4]:
print(importsCorpus[0:5])
print(numSections[0:5])
print(sectionNames[0:5])

[['ws2_32', 'rpcrt4', 'kernel32', 'user32', 'advapi32', 'ole32', 'oleaut32'], ['ntoskrnl', 'hal'], ['ws2_32', 'rpcrt4', 'kernel32', 'user32', 'advapi32', 'ole32', 'oleaut32'], ['ntoskrnl', 'hal'], ['ntoskrnl', 'hal']]
[5, 6, 5, 7, 6]
[['.text', '.rdata', '.data', '.rsrc', '.reloc'], ['.text', '.rdata', '.data', 'init', '.rsrc', '.reloc'], ['.text', '.rdata', '.data', '.rsrc', '.reloc'], ['.text', '.rdata', '.data', 'page', 'init', '.rsrc', '.reloc'], ['.text', '.rdata', '.data', 'init', '.rsrc', '.reloc']]


**Exercise:** Find and print the names of the top 5 most frequently imported DLLs in the "Mediyes" malware dataset. Use a process lookup website (e.g., processlibrary.com) to find the function of each of these top 5 DLLs. Do you see anything suspicious in these calls?

In [8]:
# Your code
from collections import Counter

# Assuming importsCorpus is populated with the list of DLLs from all the files as per your provided code
all_imports = [dll for sublist in importsCorpus for dll in sublist]  # Flatten the list of lists
dll_counts = Counter(all_imports)  # Count occurrences of each DLL

# Find the top 5 most common DLLs
top_5_dlls = dll_counts.most_common(5)
print("Top 5 DLLs:", top_5_dlls)

# For each of the top 5 DLLs, you would typically lookup their functions on a process lookup website.
# Since I can't browse the web directly, I'll list the DLLs, and you can look them up manually.
for dll, count in top_5_dlls:
    print(f"{dll} is imported {count} times.")
    # Here you would use a process lookup website to find the function of each DLL.


Top 5 DLLs: [('kernel32', 302), ('advapi32', 287), ('ws2_32', 283), ('user32', 282), ('ole32', 282)]
kernel32 is imported 302 times.
advapi32 is imported 287 times.
ws2_32 is imported 283 times.
user32 is imported 282 times.
ole32 is imported 282 times.


 -- Top 5 DLLs: [('kernel32', 302), ('advapi32', 287), ('ws2_32', 283), ('user32', 282), ('ole32', 282)]
kernel32 is imported 302 times.
advapi32 is imported 287 times.
ws2_32 is imported 283 times.
user32 is imported 282 times.
ole32 is imported 282 times.