1. PARSE XML FILES
2. TURN INTO TREE
3. IF THAT FAILS, PARSE AS MANY OF THE ENTRIES AS POSSIBLE

In [1]:
import os
import re
import shutil
import traceback
import xml.etree.ElementTree as ET
from collections import Counter
from pathlib import Path

import git

# Work from the root of the enclosing git working tree so that the relative
# paths used below resolve regardless of where the notebook was started.
repo = git.Repo('.', search_parent_directories=True)
os.chdir(repo.working_tree_dir)

# srcpaths=(
#     "traces-1m-worker_3_overnight_portclash",
#     "traces-1m-worker_4_overnight_fixportclash",
# )
srcpaths=(
    "trace_run_3_success_3h/traces-10m/logs-xmls",
)

# Collect the trace files of every source directory.  "*.xml" also matches the
# "<name>.xml.repair.xml" files that recover_functions() writes next to its
# input, so those are skipped; otherwise a second run of the notebook would
# treat the previous run's repair output as fresh input.
xmls = []
for d in srcpaths:
    xmls += [p for p in Path(d).glob("*.xml") if not p.name.endswith(".repair.xml")]

# Files from different directories end up side by side in one output
# directory, so report base names that occur more than once.  Counting once
# up front keeps this linear in the number of files.
name_counts = Counter(f.name for f in xmls)
duplicates = [f for f in xmls if name_counts[f.name] > 1]
print(len(xmls), "XML files found,", len(name_counts), "unique. duplicates =", duplicates)

285 XML files found, 285 unique. duplicates = []


In [2]:
# Output directory for the post-processed traces.  Anything left over from a
# previous run is deleted so that every run starts from an empty directory.
dstdir = Path("postprocessed")

if dstdir.exists():
    shutil.rmtree(dstdir)

dstdir.mkdir()

In [3]:
#!pip install tqdm jupyter ipywidgets

In [4]:
import tqdm as tqdm

# ROUND 1: try to parse every trace file as-is.  Files that parse are copied
# unchanged into dstdir; files that raise a ParseError are collected in
# failed_1.
successes_1 = []
failed_1 = []
for fpath in tqdm.tqdm(xmls):
    try:
        # Only the success or failure of the parse matters here, so the tree
        # is deliberately not bound to a name: a lingering reference would
        # keep this file's whole tree alive while the next file is parsed.
        ET.parse(fpath)
    except ET.ParseError:
        failed_1.append(fpath)
    else:
        successes_1.append(fpath)
        shutil.copyfile(fpath, dstdir / fpath.name)

print("ROUND 1:", len(failed_1), "files failed parsing")
# Record the files that parsed without repair, one path per line.
(dstdir/"1_repair_success_1.txt").write_text("\n".join(map(str, successes_1)))

100%|█████████████████████████████████████████| 285/285 [56:50<00:00, 11.97s/it]


ROUND 1: 150 files failed parsing


12980

In [5]:
import tqdm as tqdm

# Line patterns used by recover_functions() to cut a trace into individual
# fuzzer-target calls.
#
# The separator in front of "fuzzerTestOneInput" is escaped so that only a
# literal dot is accepted; an unescaped "." would match any character.
# NOTE(review): assumes the method attribute looks like
# "SomeClass.fuzzerTestOneInput(args)" with no dots in the class part, as the
# [^."]+ prefix implies - confirm against the trace format.

# Opening <call ...> tag whose method attribute names a fuzzerTestOneInput method.
fuzzer_start = re.compile(r'''<call[^>]*method="[^."]+\.fuzzerTestOneInput\([^)]+\)"[^>]*''')
# Any complete <tracepoint ...> tag; method_re and exit_re are applied to its text.
fuzzer_almost_end = re.compile(r'''<tracepoint[^>]+>''')
# method attribute that names a fuzzerTestOneInput method.
method_re = re.compile(r'''method="[^."]+\.fuzzerTestOneInput\([^)]+\)"''')
# type attribute of an exit tracepoint.
exit_re = re.compile(r'''type="exit"''')
# Closing tag of a call element.
fuzzer_real_end = re.compile(r'''</call>''')


from xml.dom import minidom
from xml.parsers.expat import ExpatError

def prettify(rough_string):
    """Re-indent an XML document that is given as a string.

    The text is parsed with ``minidom`` and serialised again with an indent of
    two spaces per nesting level.  The returned text begins with the XML
    declaration line that ``toprettyxml`` emits.
    """
    return minidom.parseString(rough_string).toprettyxml(indent="  ")

def _write_fuzzer_call(call_lines, outf):
    """Pretty-print one collected fuzzer call and append it to ``outf``.

    Returns True when the call was written and False when its text could not
    be parsed as XML (in which case nothing is written).
    """
    try:
        pretty = prettify("".join(call_lines))
    except (ExpatError, ET.ParseError):
        return False
    # The first line of the pretty-printed text is the XML declaration, which
    # must not be repeated inside the repaired document.
    outf.write("".join(pretty.splitlines(keepends=True)[1:]) + "\n")
    return True


def recover_functions(fpath):
    """Salvage the complete fuzzer-target calls of a trace that failed to parse.

    The file is scanned line by line.  Collection starts at a line matching
    ``fuzzer_start``; once a tracepoint tag matching both ``method_re`` and
    ``exit_re`` has been seen, collection continues up to and including the
    next line matching ``fuzzer_real_end``.  Every call collected this way is
    pretty-printed on its own and written to ``<fpath>.repair.xml``; calls
    that do not parse are dropped.  A call still open at end of file is
    dropped as well.

    The first input line is copied verbatim and ``</trace>`` is appended at
    the end.
    NOTE(review): this presumes the first line is the opening ``<trace>`` tag
    - confirm against the trace format.

    The progress bar postfix shows the number of calls seen ("all") and the
    number that could not be recovered ("failed").

    Parameters:
        fpath: path (``str`` or ``Path``) of the trace file to repair.

    Returns:
        ``Path`` of the repair file that was written.
    """
    with open(fpath) as f:
        # Split straight away so the unsplit text is not kept alive as well.
        xmllines = f.read().splitlines(keepends=True)
    repair_path = Path(str(fpath) + ".repair.xml")
    failed_functions = 0
    all_functions = 0
    # Lines of the call being collected; None while outside a fuzzer call.
    current_fuzz_function = None
    # True once the call's exit tracepoint was seen and only the closing
    # </call> line is still missing.
    exit_seen = False
    with open(repair_path, "w") as outf:
        with tqdm.tqdm(xmllines, total=len(xmllines), desc="deconstruct into fuzzer target calls") as pbar:
            lines = iter(pbar)
            first_line = next(lines, None)
            if first_line is not None:
                outf.write(first_line)
            for line in lines:
                if exit_seen:
                    current_fuzz_function.append(line)
                    if fuzzer_real_end.search(line):
                        all_functions += 1
                        if not _write_fuzzer_call(current_fuzz_function, outf):
                            failed_functions += 1
                        pbar.set_postfix({"all": all_functions, "failed": failed_functions})
                        # Forget the finished call so that lines between
                        # calls are not accumulated in a stale buffer.
                        current_fuzz_function = None
                        exit_seen = False
                    continue

                if fuzzer_start.search(line):
                    current_fuzz_function = []
                if current_fuzz_function is not None:
                    current_fuzz_function.append(line)

                m = fuzzer_almost_end.search(line)
                if not m:
                    continue
                tag = m.group(0)
                if not (method_re.search(tag) and exit_re.search(tag)):
                    continue

                if current_fuzz_function is None:
                    # Exit tracepoint without a matching start line: there is
                    # nothing to recover, so count the call as lost instead
                    # of failing on the missing buffer.
                    all_functions += 1
                    failed_functions += 1
                    pbar.set_postfix({"all": all_functions, "failed": failed_functions})
                else:
                    exit_seen = True
        outf.write("</trace>")
    return repair_path

# repair_path = recover_functions("traces-1m-worker_3_overnight_portclash/trace-apache-commons-cli-ParserFuzzer.xml")
# root = ET.parse(repair_path).getroot()
# repair_path

In [None]:
# ROUND 2: for every file that failed in round 1, write a repaired copy with
# recover_functions() and try to parse that copy.  Repaired files that parse
# are copied into dstdir; the others are collected in failed_2.
successes_2 = []
failed_2 = []
for fpath in tqdm.tqdm(failed_1, position=1, desc="round 2"):
    repair_path = recover_functions(fpath)
    try:
        # Only the success or failure of the parse matters, so the tree is
        # not bound to a name: a lingering reference would keep it alive
        # while the next file is repaired and parsed.
        ET.parse(repair_path)
    except ET.ParseError as ex:
        print("exception", type(ex), ex, "parsing", fpath)
        failed_2.append(repair_path)
    else:
        successes_2.append(repair_path)
        shutil.copyfile(repair_path, dstdir / repair_path.name)

print("ROUND 2:", len(failed_2), "files failed parsing")
# Record the repaired files that parsed, one path per line.
(dstdir/"1_repair_success_2.txt").write_text("\n".join(map(str, successes_2)))

# root = ET.parse('traces-1m/trace-angus-mail-BASE64EncoderStreamFuzzer.xml').getroot()
# root


deconstruct into fuzzer target calls: 100%|█| 3756964/3756964 [04:36<00:00, 1356[A

deconstruct into fuzzer target calls: 100%|█| 307/307 [00:00<00:00, 400015.95it/[A

deconstruct into fuzzer target calls: 100%|█| 2384900/2384900 [02:28<00:00, 1601[A

deconstruct into fuzzer target calls: 100%|█| 173/173 [00:00<00:00, 595253.97it/[A

deconstruct into fuzzer target calls: 100%|█| 37012/37012 [00:00<00:00, 1162660.[A
deconstruct into fuzzer target calls: 100%|█| 35773/35773 [00:00<00:00, 1161475.

deconstruct into fuzzer target calls: 100%|█| 39325/39325 [00:00<00:00, 1196855.[A
deconstruct into fuzzer target calls: 100%|█| 36378/36378 [00:00<00:00, 1132280.

deconstruct into fuzzer target calls: 100%|█| 15984/15984 [00:00<00:00, 1260278.[A

deconstruct into fuzzer target calls: 100%|█| 40046572/40046572 [1:09:49<00:00, [A

deconstruct into fuzzer target calls: 100%|█| 1902/1902 [00:00<00:00, 1047749.70[A

deconstruct into fuzzer target calls: 100%|█| 5009/5009 [00:00<00:00, 10

deconstruct into fuzzer target calls: 100%|█| 74405/74405 [00:00<00:00, 1197965.[A

deconstruct into fuzzer target calls: 100%|█| 16670/16670 [00:00<00:00, 1206853.[A
deconstruct into fuzzer target calls: 100%|█| 35200/35200 [00:00<00:00, 957454.6

deconstruct into fuzzer target calls: 100%|█| 2625508/2625508 [02:51<00:00, 1530[A

deconstruct into fuzzer target calls: 100%|█| 205/205 [00:00<00:00, 396601.62it/[A

deconstruct into fuzzer target calls: 100%|█| 22398561/22398561 [36:36<00:00, 10[A

deconstruct into fuzzer target calls: 100%|█| 14/14 [00:00<00:00, 203889.78it/s][A

deconstruct into fuzzer target calls: 100%|█| 363459/363459 [00:00<00:00, 128329[A

deconstruct into fuzzer target calls: 100%|█| 11218417/11218417 [07:32<00:00, 24[A

round 2:  71%|███████████████████        | 106/150 [4:58:42<5:03:31, 413.89s/it][A

In [None]:
# Every usable trace: files that parsed directly, followed by the repaired ones.
all_files = [*successes_1, *successes_2]

In [None]:
# Write the combined list of usable traces, one path per line.
success_list_path = dstdir / "1_repair_success_all.txt"
with open(success_list_path, "w") as f:
    f.write("\n".join(str(path) for path in all_files))