1. PARSE XML FILES
2. TURN INTO TREE
3. CHECK TREE STATISTICS

In [2]:
import traceback
import xml.etree.ElementTree as ET
from pathlib import Path
import re

xmls = list(Path("traces-1m-worker_3_overnight_portclash").glob("*.xml")) + list(Path("traces-1m-worker_4_overnight_fixportclash").glob("*.xml"))
print(len(xmls), "XML files found,", len(set([f.name for f in xmls])), "unique. duplicates =", [f for f in xmls if [g.name for g in xmls].count(f.name) > 1])
# TODO remove duplicates

179 XML files found, 179 unique. duplicates = []


In [3]:
#!pip install tqdm jupyter ipywidgets

In [6]:
import tqdm.auto as tqdm

successes_1 = []
failed_1 = []
for fpath in tqdm.tqdm(xmls):
    try:
        root = ET.parse(fpath).getroot()
        # print(fpath, "parsed successfully")
        successes_1.append(fpath)
    except ET.ParseError as ex:
        # print("exception", type(ex).__name__, ex, "parsing", fpath)
        failed_1.append(fpath)

print("ROUND 1:", len(failed_1), "files failed parsing")
Path("SUCCESS_FILES_SHAGAGONOOK_1.txt").write_text("\n".join(map(str, successes_1)))

  0%|          | 0/179 [00:00<?, ?it/s]

ROUND 1: 140 files failed parsing


3279

In [18]:
import tqdm.auto as tqdm

fuzzer_start = re.compile(r'''<call[^>]*method="[^."]+.fuzzerTestOneInput\([^)]+\)"[^>]*''')
fuzzer_almost_end = re.compile(r'''<tracepoint[^>]+>''')
method_re = re.compile(r'''method="[^."]+.fuzzerTestOneInput\([^)]+\)"''')
exit_re = re.compile(r'''type="exit"''')
fuzzer_real_end = re.compile(r'''</call>''')


from xml.dom import minidom
from xml.parsers.expat import ExpatError

def prettify(rough_string):
    """Return a pretty-printed XML string for the Element.
    """
    # rough_string = ET.tostring(elem)
    reparsed = minidom.parseString(rough_string)
    
    # text = reparsed.toprettyxml(indent=" " * 2)
    # return "".join(text.splitlines(keepends=True)[1:])
    
    # return reparsed.childNodes[0].toprettyxml(indent=" " * 2)
    
    return reparsed.toprettyxml(indent="  ")

def recover_functions(fpath):
    with open(fpath) as f:
        xmlstring = f.read()
    xmllines = xmlstring.splitlines(keepends=True)
    it = iter(xmllines)
    repair_path = Path(str(fpath) + ".repair")
    failed_functions = 0
    all_functions = 0
    with open(repair_path, "w") as outf:
        with tqdm.tqdm(it, total=len(xmllines), desc="deconstruct into fuzzer target calls") as pbar:
            it = iter(pbar)
            fuzz_functions = []
            current_fuzz_function = None
            try:
                outf.write(next(it))
                while True:
                    line = next(it)
                    if fuzzer_start.search(line):
                        # print("start at", line)
                        # start fuzzed function
                        current_fuzz_function = []
                    if current_fuzz_function is not None:
                        current_fuzz_function.append(line)
                    m = fuzzer_almost_end.search(line)
                    if m:
                        tag = m.group(0)
                        if method_re.search(tag) and exit_re.search(tag):
                            # print("end at", line)
                            while True:
                                line = next(it)
                                # print("search end", line)
                                current_fuzz_function.append(line)
                                if fuzzer_real_end.search(line):
                                    # cap off fuzzed function
                                    # print("end", line)
                                    all_functions += 1
                                    try:
                                        func_xml = "".join(current_fuzz_function)
                                        # ET.fromstring(func_xml)
                                        # print(func_xml)
                                        func_xml = "".join(prettify(func_xml).splitlines(keepends=True)[1:])
                                        outf.write(func_xml + "\n")
                                    except (ExpatError, ET.ParseError):
                                        failed_functions += 1
                                    pbar.set_postfix({"all": all_functions, "failed": failed_functions})
                                    break
            except StopIteration:
                pass
        outf.write("</trace>")
    
#     print("found", len(fuzz_functions), "functions")

#     success_functions = []
#     failed_functions = 0
#     pbar = tqdm.tqdm(fuzz_functions, desc="parse individual fuzzer targets")
#     for fuzzed_function in pbar:
#         try:
#             root = ET.ElementTree(ET.fromstring(fuzzed_function)).getroot()
#             # ET.indent(root, space="\t", level=0)
#             success_functions.append(ET.tostring(root, encoding='unicode', method='xml'))
#         except ET.ParseError as ex:
#             # print("function exception", type(ex), ex, "parsing", fpath)
#             # print(fuzzed_function)
#             failed_functions += 1
#         pbar.set_postfix({"failed": failed_functions})
    # print("failed", failed_functions, "out of", all_functions, "functions")

#     success_text = "\n".join((xmllines[0], "".join(success_functions), "</trace>"))
#     success_text = prettify(success_text)
#     repair_path.write_text("".join(success_text))
    return repair_path

repair_path = recover_functions("traces-1m-worker_3_overnight_portclash/trace-apache-commons-cli-ParserFuzzer.xml")
root = ET.parse(repair_path).getroot()
repair_path

deconstruct into fuzzer target calls:   0%|          | 0/1145582 [00:00<?, ?it/s]

PosixPath('traces-1m-worker_3_overnight_portclash/trace-apache-commons-cli-ParserFuzzer.xml.repair')

In [20]:
successes_2 = []
failed_2 = []
for fpath in tqdm.tqdm(failed_1, position=1, desc="round 2"):
    repair_path = recover_functions(fpath)
    try:
        root = ET.parse(repair_path).getroot()
        successes_2.append(repair_path)
    except ET.ParseError as ex:
        print("exception", type(ex), ex, "parsing", fpath)
        failed_2.append(repair_path)

print("ROUND 2:", len(failed_2), "files failed parsing")
Path("SUCCESS_FILES_SHAGAGONOOK_2.txt").write_text("\n".join(successes_2))

# root = ET.parse('traces-1m/trace-angus-mail-BASE64EncoderStreamFuzzer.xml').getroot()
# root

round 2:   0%|          | 0/140 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1145582 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1041660 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/129 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/35686 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/2206 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/33257 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1278714 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1329474 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/48058 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/9620 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/544 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/138 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1313343 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/842 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/2125803 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/35898 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/35696 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1001 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/138 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1144076 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/3720 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/35506 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1304207 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/107132 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/252779 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/153 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/542408 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/550250 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/30498 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1087095 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/138 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1027 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/54972 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1468 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/136 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/5445417 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/518 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/9705 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/4163831 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1419724 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/547669 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/3809882 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/174804 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/445167 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/8543 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/34936 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/598985 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/465143 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/164 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/40841 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/648001 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/171857 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/642353 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/35695 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/35705 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/10369195 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/745 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/42274 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/5955248 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/171878 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/2951 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/271 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/36959 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/39474 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/36332 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/15905 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/11663908 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1881 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/4927 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1344 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/364881 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/175435 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/9215 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/32136 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1320 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/20732 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1616721 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/250 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/965138 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/141769 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/4927 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/41370 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/3837 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/68419 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/20467 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/47839 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/594232 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/9066846 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/761276 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/479 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1526765 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/147567 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/9614 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/445187 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/13511 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/34989 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/46057 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/179618 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/9621 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/3089229 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/141626 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/843662 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/279108 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/4927 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/36734 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/95768 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/547981 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/16601 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1024 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/133026 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/74342 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/16594 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/35741 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1316564 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/138 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/7409360 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/391486 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/5488982 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/557 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/93677 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/653 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/684602 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/35707 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/3802594 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/152089 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/9614 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/8030 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/342142 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/35118 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1478319 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/3352475 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/35100 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/46305 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1462 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/38135 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/536236 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/3984 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/1428070 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/223744 [00:00<?, ?it/s]

deconstruct into fuzzer target calls:   0%|          | 0/9705 [00:00<?, ?it/s]

ROUND 2: 0 files failed parsing


TypeError: sequence item 0: expected str instance, PosixPath found

In [22]:
Path("SUCCESS_FILES_SHAGAGONOOK_2.txt").write_text("\n".join(map(str, successes_2)))

12671

In [23]:
all_files = successes_1 + successes_2

In [24]:
with open("SUCCESS_FILES_SHAGAGONOOK.txt", "w") as f:
    f.write("\n".join(map(str, all_files)))