1. PARSE XML FILES
2. TURN INTO TREE
3. IF PARSING FAILS, RECOVER AS MANY OF THE ENTRIES AS POSSIBLE

In [18]:
import os
import re
import shutil
import traceback
import xml.etree.ElementTree as ET
from collections import Counter
from pathlib import Path

import git

# Work from the root of the enclosing git repository so that the relative
# output paths used by later cells resolve the same way wherever the
# notebook was launched from.
repo = git.Repo('.', search_parent_directories=True)
os.chdir(repo.working_tree_dir)

# NOTE(review): machine-specific absolute path; consider moving it into a
# configurable constant or environment variable before sharing the notebook.
srcpaths = (
    "/run/media/benjis/basilisk/Files/biggie/oss-fuzz/fuzz_10m_trace_3h/trace_run_3_success_3h/traces-10m/logs-xmls",
)

# Collect every raw trace XML, skipping files whose name contains ".repair."
# (the marker used for repaired outputs).
xmls = []
for d in srcpaths:
    xmls += [p for p in Path(d).glob("*.xml") if ".repair." not in p.name]

# Count file names once so the duplicate report is linear in the number of
# files instead of rebuilding and scanning the name list for every file.
name_counts = Counter(f.name for f in xmls)
print(len(xmls), "XML files found,", len(name_counts), "unique. duplicates =", [f for f in xmls if name_counts[f.name] > 1])

178 XML files found, 178 unique. duplicates = []


In [19]:
# Destination for validated and repaired XML files. Any directory left over
# from a previous run is removed first so this run starts from an empty one.
dstdir = Path("postprocessed_xmls")
if dstdir.exists():
    shutil.rmtree(dstdir)
dstdir.mkdir(exist_ok=True)

In [20]:
import tqdm as tqdm

# ROUND 1: keep every source file that is already well-formed XML.
successes_1 = []
failed_1 = []
for fpath in tqdm.tqdm(xmls):
    try:
        # Only well-formedness matters here, so drop each element's contents
        # as soon as it has been parsed instead of letting the whole tree
        # accumulate in memory for these very large trace files.
        for _event, elem in ET.iterparse(fpath):
            elem.clear()
    except ET.ParseError:
        # Malformed file: queue it for the function-level repair in round 2.
        failed_1.append(fpath)
    else:
        successes_1.append(fpath)
        shutil.copyfile(fpath, dstdir/fpath.name)

print("ROUND 1:", len(failed_1), "files failed parsing")
(dstdir/"1_repair_success_1.txt").write_text("\n".join(map(str, successes_1)))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 178/178 [1:20:47<00:00, 27.24s/it]


ROUND 1: 150 files failed parsing


4367

In [21]:
import tqdm as tqdm

# `method` attribute identifying the fuzz target entry point, e.g.
# method="SomeFuzzer.fuzzerTestOneInput(<args>)".
# NOTE(review): the "." before fuzzerTestOneInput is unescaped, so it matches
# any single character; presumably a literal dot was intended -- confirm.
_FUZZ_TARGET_METHOD_ATTR = r'method="[^."]+.fuzzerTestOneInput\([^)]+\)"'

# Opening <call ...> tag of a fuzz target invocation.
fuzzer_start = re.compile(r'<call[^>]*' + _FUZZ_TARGET_METHOD_ATTR + r'[^>]*')
# Any <tracepoint ...> tag; candidates are narrowed with method_re / exit_re.
fuzzer_almost_end = re.compile(r'<tracepoint[^>]+>')
# The fuzz target `method` attribute on its own.
method_re = re.compile(_FUZZ_TARGET_METHOD_ATTR)
# Marks a tracepoint as an exit tracepoint.
exit_re = re.compile(r'type="exit"')
# Closing tag that ends a call element.
fuzzer_real_end = re.compile(r'</call>')


from xml.dom import minidom
from xml.parsers.expat import ExpatError

def prettify(rough_string):
    """Re-parse an XML string with minidom and return it pretty-printed.

    The result is indented with two spaces per nesting level and begins with
    the XML declaration line that minidom emits. Malformed input makes
    ``minidom.parseString`` raise its parse error to the caller.
    """
    return minidom.parseString(rough_string).toprettyxml(indent="  ")

def recover_functions(fpath):
    """Salvage every complete fuzz-target call from a malformed trace file.

    The file is scanned line by line. A call starts at a line matching
    ``fuzzer_start`` and ends at the first line matching ``fuzzer_real_end``
    that follows an exit tracepoint of the fuzz target method. Each complete
    call is re-parsed and pretty-printed with ``prettify``; calls that do not
    parse are counted as failed and dropped. The first line of the input is
    copied verbatim and a closing ``</trace>`` tag is appended at the end.
    A call that is still open when the input ends is discarded.

    Args:
        fpath: Path of the trace file to repair. Must expose a ``name``
            attribute (e.g. ``pathlib.Path``).

    Returns:
        Path of the repaired file, ``dstdir / "<fpath.name>.repair.xml"``.
    """
    # Decode explicitly as UTF-8 instead of the locale default, and replace
    # undecodable bytes so a corrupted file cannot abort the repair with a
    # UnicodeDecodeError.
    with open(fpath, encoding="utf-8", errors="replace") as f:
        xmllines = f.read().splitlines(keepends=True)
    repair_path = dstdir/(str(fpath.name) + ".repair.xml")
    failed_functions = 0
    all_functions = 0
    with open(repair_path, "w", encoding="utf-8") as outf:
        with tqdm.tqdm(xmllines, total=len(xmllines), desc="deconstruct into fuzzer target calls") as pbar:
            it = iter(pbar)
            # Lines of the call currently being collected; None while outside
            # any fuzz-target call.
            current_fuzz_function = None
            try:
                # The first line is copied through unchanged.
                outf.write(next(it))
                while True:
                    line = next(it)
                    if fuzzer_start.search(line):
                        # A new call begins; anything collected so far is dropped.
                        current_fuzz_function = []
                    if current_fuzz_function is None:
                        # Outside a call nothing is collected, and an exit
                        # tracepoint without a matching start is ignored.
                        continue
                    current_fuzz_function.append(line)
                    m = fuzzer_almost_end.search(line)
                    if not m:
                        continue
                    tag = m.group(0)
                    if not (method_re.search(tag) and exit_re.search(tag)):
                        continue
                    # Exit tracepoint of the fuzz target: consume lines up to
                    # and including the closing </call>.
                    while True:
                        line = next(it)
                        current_fuzz_function.append(line)
                        if fuzzer_real_end.search(line):
                            break
                    all_functions += 1
                    try:
                        func_xml = "".join(current_fuzz_function)
                        # Drop the XML declaration line that prettify() emits.
                        func_xml = "".join(prettify(func_xml).splitlines(keepends=True)[1:])
                        outf.write(func_xml + "\n")
                    except (ExpatError, ET.ParseError):
                        failed_functions += 1
                    pbar.set_postfix({"all": all_functions, "failed": failed_functions})
                    # Reset so lines between calls are not accumulated and a
                    # finished call can never be emitted a second time.
                    current_fuzz_function = None
            except StopIteration:
                # End of input; a partially collected call is discarded.
                pass
        outf.write("</trace>")

    return repair_path

In [None]:
# ROUND 2: rebuild each file that failed round 1 from its recoverable
# fuzz-target calls, then check that the rebuilt file is well-formed.
successes_2 = []
failed_2 = []
for fpath in tqdm.tqdm(failed_1, position=1, desc="round 2"):
    repair_path = recover_functions(fpath)
    try:
        # Only well-formedness matters here, so drop each element's contents
        # as soon as it has been parsed instead of letting the whole tree
        # accumulate in memory.
        for _event, elem in ET.iterparse(repair_path):
            elem.clear()
    except ET.ParseError as ex:
        print("exception", type(ex), ex, "parsing", fpath)
        failed_2.append(repair_path)
    else:
        successes_2.append(repair_path)

print("ROUND 2:", len(failed_2), "files failed parsing")
(dstdir/"1_repair_success_2_part2.txt").write_text("\n".join(map(str, successes_2)))


deconstruct into fuzzer target calls: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 40372/40372 [00:00<00:00, 612504.72it/s][A

deconstruct into fuzzer target calls: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 83662/83662 [00:00<00:00, 1049899.95it/s][A

deconstruct into fuzzer target calls: 100%|█████████████████████████████████████████████████████████████████████████████████████| 5651425/5651425 [00:03<00:00, 1420344.26it/s][A

deconstruct into fuzzer target calls: 100%|███████████████████████████████████████████████████████████████████| 3756964/3756964 [04:58<00:00, 12580.17it/s, all=721, failed=14][A

deconstruct into fuzzer target calls: 100%|████████████████████████████████████████████████████████████████████| 3744975/3744975 [04:18<00:00, 14498.60it/s, all=128, failed=0][A

deconstruct into fuzzer target calls: 100%|████████████████████████████████████████████████████████

In [None]:
# Every usable file: originals that parsed in round 1 followed by the
# repaired files that parsed in round 2.
all_files = [*successes_1, *successes_2]

In [None]:
# Write the combined list of usable files, one path per line.
with (dstdir/"1_repair_success_all.txt").open("w") as f:
    f.write("\n".join(str(p) for p in all_files))