In [1]:
!pip -q install lief pefile pyelftools yara-python capstone

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.8/74.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os, re, json, math, hashlib

import lief
import pefile
import yara
from capstone import Cs, CS_ARCH_X86, CS_MODE_32, CS_MODE_64
from elftools.elf.elffile import ELFFile

# Path of the file to triage. While this is None (or does not exist on disk)
# the analysis cell only prints a hint and does nothing else.
SAMPLE_PATH = None

In [3]:
# --- helpers ---
def sha256_file(path, chunk=1024*1024):
    """Return the hex-encoded SHA-256 digest of the file at ``path``.

    The file is read in pieces of at most ``chunk`` bytes, so the whole
    content is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            piece = stream.read(chunk)
            if not piece:
                # An empty read means end of file.
                break
            digest.update(piece)
    return digest.hexdigest()

def md5_file(path, chunk=1024*1024):
    """Return the hex-encoded MD5 digest of the file at ``path``.

    Reads at most ``chunk`` bytes per iteration and feeds each piece to the
    hash, so large files are processed with bounded memory.
    """
    hasher = hashlib.md5()
    with open(path, "rb") as handle:
        block = handle.read(chunk)
        while block:
            hasher.update(block)
            block = handle.read(chunk)
    return hasher.hexdigest()

def shannon_entropy(data: bytes) -> float:
    """Return the Shannon entropy of ``data`` in bits per byte.

    The result ranges from 0.0 (empty input, or a single repeated byte value)
    to 8.0 (all 256 byte values equally frequent).
    """
    total = len(data)
    if total == 0:
        return 0.0

    # Histogram of byte values 0..255.
    histogram = [0] * 256
    for byte in data:
        histogram[byte] += 1

    entropy = 0.0
    for occurrences in histogram:
        if occurrences == 0:
            # Absent byte values contribute nothing (and log2(0) is undefined).
            continue
        probability = occurrences / total
        entropy -= probability * math.log2(probability)
    return entropy

def extract_ascii_strings(data: bytes, min_len=4):
    """Return every run of printable ASCII in ``data`` of at least ``min_len`` bytes.

    Runs are returned as ``str`` in the order they occur in ``data``.
    """
    # Space (0x20) through tilde (0x7E) is the printable ASCII range.
    printable_run = re.compile(rb"[ -~]{%d,}" % min_len)
    found = []
    for hit in printable_run.finditer(data):
        found.append(hit.group(0).decode("ascii", errors="ignore"))
    return found

def find_iocs(strings):
    """Scan extracted strings for indicators of compromise.

    Parameters
    ----------
    strings : iterable of str
        Strings pulled from the sample; they are joined with newlines and
        searched as one text.

    Returns
    -------
    dict
        Keys ``urls``, ``domains``, ``ips``, ``emails``, ``registry`` and
        ``suspicious_keywords``; each value is a sorted list of unique hits.

    Notes
    -----
    * IPv4 candidates whose octets exceed 255 (e.g. ``999.300.1.1``) are
      dropped; the bare regex would otherwise report them.
    * Keywords of three characters or fewer (``tor``, ``AES``, ``RC4``) must
      not be surrounded by letters, otherwise ``tor`` would fire on words
      such as "directory" or "history". Longer keywords are still matched
      as case-insensitive substrings.
    * The domain pattern is purely syntactic, so file names like
      ``kernel32.dll`` also show up under ``domains``.
    """
    text = "\n".join(strings)

    def unique_matches(pattern):
        # Sorted, de-duplicated list of everything the pattern finds in text.
        return sorted(set(re.findall(pattern, text)))

    ip_candidates = unique_matches(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
    iocs = {
        "urls": unique_matches(r"https?://[^\s\"\'<>]+"),
        "domains": unique_matches(r"\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b"),
        # Keep only dotted quads that are valid IPv4 addresses.
        "ips": [
            ip for ip in ip_candidates
            if all(int(octet) <= 255 for octet in ip.split("."))
        ],
        "emails": unique_matches(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"),
        "registry": unique_matches(
            r"(?:HKLM|HKCU|HKEY_LOCAL_MACHINE|HKEY_CURRENT_USER)\\[^\s\"\'<>]+"
        ),
        "suspicious_keywords": []
    }
    keywords = [
        "powershell", "cmd.exe", "rundll32", "regsvr32", "schtasks", "wmic", "vssadmin",
        "DisableRealtimeMonitoring", "Add-MpPreference", "mimikatz", "keylogger",
        "Crypto", "AES", "RC4", "base64", "tor", ".onion",
        "CreateRemoteThread", "VirtualAlloc", "WriteProcessMemory", "WinExec",
        "InternetOpen", "HttpSendRequest", "URLDownloadToFile", "WSAStartup"
    ]
    low = text.lower()
    short_keyword_len = 3

    hits = set()
    for keyword in keywords:
        needle = keyword.lower()
        if len(needle) <= short_keyword_len and needle.isalnum():
            # Short tokens: require that no letter touches either side.
            pattern = r"(?<![a-z])" + re.escape(needle) + r"(?![a-z])"
            found = re.search(pattern, low) is not None
        else:
            found = needle in low
        if found:
            hits.add(keyword)

    iocs["suspicious_keywords"] = sorted(hits)
    return iocs

def detect_type(data: bytes):
    """Guess the executable container format from the leading magic bytes.

    Returns one of ``"PE (Windows)"``, ``"ELF (Linux)"``, ``"Mach-O (macOS)"``
    or ``"Unknown/Raw"``. Only the first few bytes are inspected, so this is a
    rough guess and not a validation of the file.
    """
    if data.startswith(b"MZ"):
        return "PE (Windows)"
    if data.startswith(b"\x7fELF"):
        return "ELF (Linux)"

    # Thin Mach-O images (32/64-bit, both byte orders) plus fat/universal
    # containers. Checking only the fat magics misses ordinary single-arch
    # binaries.
    # NOTE(review): Java .class files also start with CA FE BA BE, so that
    # magic alone is ambiguous.
    macho_magics = (
        b"\xFE\xED\xFA\xCE",  # MH_MAGIC (32-bit)
        b"\xCE\xFA\xED\xFE",  # MH_CIGAM (32-bit, byte-swapped)
        b"\xFE\xED\xFA\xCF",  # MH_MAGIC_64
        b"\xCF\xFA\xED\xFE",  # MH_CIGAM_64 (byte-swapped)
        b"\xCA\xFE\xBA\xBE",  # FAT_MAGIC
        b"\xBE\xBA\xFE\xCA",  # FAT_CIGAM
        b"\xCA\xFE\xBA\xBF",  # FAT_MAGIC_64
        b"\xBF\xBA\xFE\xCA",  # FAT_CIGAM_64
    )
    if data.startswith(macho_magics):
        return "Mach-O (macOS)"
    return "Unknown/Raw"

In [4]:
# Static triage of SAMPLE_PATH: hashes, entropy, strings/IOCs, PE/ELF metadata,
# a small YARA scan and a short entry-point disassembly. Results are collected
# in `report`, summarised on stdout and saved as JSON.
if not SAMPLE_PATH or not os.path.exists(SAMPLE_PATH):
    print("Set SAMPLE_PATH to an existing file, e.g. '/kaggle/input/<dataset>/<file>'.")
else:
    with open(SAMPLE_PATH, "rb") as f:
        data = f.read()

    # 1) identity + whole-file statistics
    report = {
        "path": SAMPLE_PATH,
        "size_bytes": len(data),
        "sha256": sha256_file(SAMPLE_PATH),
        "md5": md5_file(SAMPLE_PATH),
        "file_type_guess": detect_type(data),
        "entropy": round(shannon_entropy(data), 3),
    }

    # 2) strings + IOCs
    strings = extract_ascii_strings(data, min_len=5)
    report["strings_count"] = len(strings)
    report["top_strings_sample"] = strings[:30]
    report["iocs"] = find_iocs(strings)

    # 3) parse metadata (PE/ELF via LIEF, and PE extras via pefile)
    pe = None  # reset so a stale object from an earlier run is never reused
    try:
        # Named `binary` rather than `bin` so the builtin bin() is not shadowed.
        binary = lief.parse(SAMPLE_PATH)
        report["lief_format"] = str(binary.format) if binary else None

        # Dispatch on the parsed object's class instead of a format enum.
        # NOTE(review): the enum appears to have moved between LIEF releases
        # and the install above is unpinned - confirm against the installed
        # version.
        if isinstance(binary, lief.PE.Binary):
            # Parse from the bytes already in memory: the file is not read
            # again and pefile holds no open file mapping that would need
            # closing. The same object is reused for the disassembly step.
            pe = pefile.PE(data=data, fast_load=True)
            pe.parse_data_directories(directories=[
                pefile.DIRECTORY_ENTRY["IMAGE_DIRECTORY_ENTRY_IMPORT"],
                pefile.DIRECTORY_ENTRY["IMAGE_DIRECTORY_ENTRY_EXPORT"],
            ])

            imports = []
            if hasattr(pe, "DIRECTORY_ENTRY_IMPORT"):
                for entry in pe.DIRECTORY_ENTRY_IMPORT:
                    dll = entry.dll.decode(errors="ignore")
                    for imp in entry.imports[:200]:  # cap per DLL
                        # Imports without a name are referenced by ordinal.
                        name = (imp.name.decode(errors="ignore") if imp.name else f"ord:{imp.ordinal}")
                        imports.append(f"{dll}!{name}")

            sections = []
            for s in pe.sections:
                sections.append({
                    "name": s.Name.decode(errors="ignore").strip("\x00"),
                    "vsize": int(s.Misc_VirtualSize),
                    "rsize": int(s.SizeOfRawData),
                    "entropy": round(s.get_entropy(), 3)
                })

            report["pe"] = {
                "machine": hex(pe.FILE_HEADER.Machine),
                "timestamp": int(pe.FILE_HEADER.TimeDateStamp),
                "subsystem": int(pe.OPTIONAL_HEADER.Subsystem),
                "entry_point": hex(pe.OPTIONAL_HEADER.AddressOfEntryPoint),
                "image_base": hex(pe.OPTIONAL_HEADER.ImageBase),
                "imports_sample": imports[:200],
                "sections": sections,
            }

        elif isinstance(binary, lief.ELF.Binary):
            with open(SAMPLE_PATH, "rb") as f:
                elf = ELFFile(f)
                sec_names = [sec.name for sec in elf.iter_sections()]
                report["elf"] = {
                    "entry_point": hex(elf.header["e_entry"]),
                    "is_64bit": elf.elfclass == 64,
                    "sections_sample": sec_names[:80],
                }
    except Exception as e:
        # Best effort: a malformed sample must not abort the whole triage.
        report["parse_error"] = str(e)

    # 4) YARA scan (simple starter rules)
    yara_rules = r"""
rule Suspicious_Commands {
  strings:
    $a = "powershell" nocase
    $b = "cmd.exe" nocase
    $c = "rundll32" nocase
    $d = "schtasks" nocase
  condition:
    any of them
}
rule Process_Injection_APIs {
  strings:
    $a = "CreateRemoteThread" nocase
    $b = "WriteProcessMemory" nocase
    $c = "VirtualAlloc" nocase
  condition:
    2 of them
}
rule Network_APIs {
  strings:
    $a = "InternetOpen" nocase
    $b = "HttpSendRequest" nocase
    $c = "WSAStartup" nocase
  condition:
    any of them
}
"""
    rules = yara.compile(source=yara_rules)
    matches = rules.match(data=data)
    report["yara_matches"] = [m.rule for m in matches]

    # 5) Optional: tiny disassembly of the first 64 bytes at the PE entry point.
    # For real binaries you would disassemble .text; this is only a demo.
    disasm = []
    try:
        if pe is not None and report.get("pe"):
            # Choose the decoder width from the COFF machine field; decoding a
            # 32-bit image in 64-bit mode produces wrong instructions.
            x86_modes = {
                0x014C: CS_MODE_32,  # IMAGE_FILE_MACHINE_I386
                0x8664: CS_MODE_64,  # IMAGE_FILE_MACHINE_AMD64
            }
            cs_mode = x86_modes.get(pe.FILE_HEADER.Machine)
            if cs_mode is None:
                report["disasm_note"] = (
                    "skipped: machine %s is not x86/x86-64" % hex(pe.FILE_HEADER.Machine)
                )
            else:
                image_base = pe.OPTIONAL_HEADER.ImageBase
                ep_rva = pe.OPTIONAL_HEADER.AddressOfEntryPoint
                # Translate the entry-point RVA to an offset into the raw bytes.
                ep_offset = pe.get_offset_from_rva(ep_rva)
                code_bytes = data[ep_offset:ep_offset+64]

                md = Cs(CS_ARCH_X86, cs_mode)
                for ins in md.disasm(code_bytes, image_base + ep_rva):
                    disasm.append(f"0x{ins.address:x}: {ins.mnemonic} {ins.op_str}")
        report["disasm_sample"] = disasm[:30]
    except Exception as e:
        # Still best effort, but keep the reason instead of dropping it.
        report["disasm_sample"] = []
        report["disasm_error"] = str(e)

    # Print a readable summary + save JSON report
    print("=== Malware Static Triage Summary ===")
    print("Path:", report["path"])
    print("Type:", report["file_type_guess"])
    print("Size:", report["size_bytes"], "bytes")
    print("SHA256:", report["sha256"])
    print("Entropy:", report["entropy"], "(high entropy can indicate packing/encryption)")
    print("YARA matches:", report["yara_matches"])
    print("IOC highlights:")
    print("  URLs:", report["iocs"]["urls"][:10])
    print("  IPs :", report["iocs"]["ips"][:10])
    print("  Domains:", report["iocs"]["domains"][:10])
    print("  Suspicious keywords:", report["iocs"]["suspicious_keywords"][:20])

    if report.get("pe"):
        print("\nPE imports (sample):", report["pe"]["imports_sample"][:30])
        print("Sections:", [(s["name"], s["entropy"]) for s in report["pe"]["sections"]])

    if report.get("disasm_sample"):
        print("\nDisassembly (sample):")
        for line in report["disasm_sample"]:
            print(" ", line)

    # Write to the Kaggle working directory when it exists; otherwise fall back
    # to the current directory so the run does not fail at the very last step.
    out_dir = "/kaggle/working" if os.path.isdir("/kaggle/working") else os.getcwd()
    out = os.path.join(out_dir, "malware_triage_report.json")
    with open(out, "w") as f:
        json.dump(report, f, indent=2)
    print("\nSaved JSON report:", out)

Set SAMPLE_PATH to an existing file, e.g. '/kaggle/input/<dataset>/<file>'.
