1. PARSE XML FILES
2. TURN INTO TREE
3. IF FAILS, PARSE AS MANY OF THE ENTRIES AS POSSIBLE

In [12]:
import traceback
import xml.etree.ElementTree as ET
from pathlib import Path
import re
import shutil
import os

import git

# Work from the root of the enclosing git repository so the relative paths below
# resolve from any directory inside the repository.
repo = git.Repo('.', search_parent_directories=True)
os.chdir(repo.working_tree_dir)

# Directories that are searched for raw trace XML files.
srcpaths = (
    "traces-10m/logs-xmls",
)

# Collect every *.xml file, skipping any whose name contains ".repair.".
xmls = [
    xml_path
    for srcdir in srcpaths
    for xml_path in Path(srcdir).glob("*.xml")
    if ".repair." not in xml_path.name
]

# Count each file name once up front; the duplicate report then needs a single
# dictionary lookup per file instead of rebuilding the whole name list every time.
name_counts = {}
for xml_path in xmls:
    name_counts[xml_path.name] = name_counts.get(xml_path.name, 0) + 1
duplicates = [xml_path for xml_path in xmls if name_counts[xml_path.name] > 1]
print(len(xmls), "XML files found,", len(name_counts), "unique. duplicates =", duplicates)

56 XML files found, 56 unique. duplicates = []


In [13]:
dstdir=Path("traces-10m/postprocessed-xmls")

In [14]:
# Start from an empty output directory: delete whatever is already there,
# then recreate the directory itself.
if dstdir.exists():
    shutil.rmtree(dstdir)
dstdir.mkdir(exist_ok=True)

In [15]:
import tqdm as tqdm

# Round 1: a file succeeds when ElementTree can stream through it end to end.
# Files that parse are copied unchanged into dstdir; the rest go on to round 2.
successes_1 = []
failed_1 = []
with tqdm.tqdm(xmls) as pbar:
    for fpath in pbar:
        try:
            # Only well-formedness matters here, so release each element as soon
            # as it is complete instead of keeping the whole tree in memory.
            for _event, elem in ET.iterparse(fpath):
                elem.clear()
            successes_1.append(fpath)
            shutil.copyfile(fpath, dstdir/fpath.name)
        except ET.ParseError:
            failed_1.append(fpath)
        pbar.set_postfix({
            "success": len(successes_1),
            "failed": len(failed_1),
        })

print("ROUND 1:", len(failed_1), "files failed parsing")
# Kept as the cell's last expression, so the notebook displays write_text's return value.
(dstdir/"1_repair_success_1.txt").write_text("\n".join(map(str, successes_1)))

100%|██████████| 56/56 [04:27<00:00,  4.78s/it, success=19, failed=37]

ROUND 1: 37 files failed parsing





1137

In [16]:
import tqdm as tqdm

# Matches the opening of a `<call ...` tag whose `method="..."` attribute names a
# `fuzzerTestOneInput(...)` method with a non-empty parameter list. The match starts
# at `<call` and stops before the tag's closing `>`.
# NOTE(review): the `.` in front of `fuzzerTestOneInput` is unescaped and therefore
# matches any character, not only a literal dot — confirm this is intended.
fuzzer_start = re.compile(r'''<call[^>]*method="[^("]+.fuzzerTestOneInput\([^)]+\)"[^>]*''')

from xml.dom import minidom
from xml.parsers.expat import ExpatError

def recover_functions_simple(fpath, dstdir):
    """Repair a trace file by cutting it off where its final fuzzer-target call opens.

    The file is read twice. The first pass records the position (line index and
    column) of the last ``fuzzer_start`` match in the file. The second pass copies
    everything before that position and appends a closing ``</trace>``.

    Args:
        fpath: Path of the trace file to repair.
        dstdir: Directory (a ``Path``) that receives ``<fpath.name>.repair.xml``.

    Returns:
        The path of the repair file. If that file already exists it is returned
        without being rebuilt. Returns ``None`` (after printing an error) when the
        input contains no ``fuzzer_start`` match at all.
    """
    repair_path = dstdir/(str(fpath.name) + ".repair.xml")
    if repair_path.exists():
        return repair_path

    # Pass 1: locate the last fuzzer-target call opening. finditer is used so that,
    # when several calls open on the same line, the final one on that line wins.
    last_fuzzer_start = None
    with open(fpath) as f:
        with tqdm.tqdm(f, total=None, desc=f"chop final fuzzer target call ({fpath.name})") as pbar:
            for i, line in enumerate(pbar):
                for m in fuzzer_start.finditer(line):
                    last_fuzzer_start = (i, m.start())
    if last_fuzzer_start is None:
        print("ERROR: last_fuzzer_start is None", fpath)
        return None

    last_line, last_col = last_fuzzer_start

    # Pass 2: write to a temporary sibling and rename it once it is complete, so an
    # interrupted run can never leave a half-written file that the exists() check
    # above would later mistake for a finished repair.
    tmp_path = repair_path.with_name(repair_path.name + ".tmp")
    with open(fpath) as f, open(tmp_path, "w") as outf:
        with tqdm.tqdm(f, total=last_line, desc=f"write file ({fpath.name})") as pbar:
            for i, line in enumerate(pbar):
                if i == last_line:
                    # Keep only the part of this line that precedes the final call.
                    outf.write(line[:last_col])
                    break
                outf.write(line)
        outf.write("</trace>")
    tmp_path.replace(repair_path)

    return repair_path

# xml_file = Path("/run/media/benjis/basilisk/Files/biggie/oss-fuzz/fuzz_10m_trace_3h/trace_run_3_success_3h/traces-10m/logs-xmls")/"trace-apache-commons-cli-ParserFuzzer.xml"
# recover_functions_simple(xml_file, Path("postprocessed_xmls_debug"))

In [17]:
# fpath = Path("/run/media/benjis/basilisk/Files/biggie/oss-fuzz/fuzz_10m_trace_3h/trace_run_3_success_3h/traces-10m/logs-xmls")/"trace-apache-commons-cli-ParserFuzzer.xml"
# repair_path = recover_functions_simple(fpath, Path("postprocessed_xmls_debug"))
# print(fpath, "->", repair_path)
# it = ET.iterparse(repair_path)
# for tag in tqdm.tqdm(it, desc=f"check XML parse ({fpath.name})"):
#     pass

In [18]:
# successes_1 = list(map(Path, (dstdir/"1_repair_success_1.txt").read_text().split("\n")))
# failed_1 = [fpath for fpath in xmls if fpath.name not in [s.name for s in successes_1]]
# len(successes_1), len(failed_1)

In [19]:
import tqdm
# Round 2: for every file that failed round 1, chop the final fuzzer-target call
# and check that the repaired file parses.
successes_2 = []
failed_2 = []
with tqdm.tqdm(failed_1, position=1, desc="round 2") as pbar:
    for fpath in pbar:
        repair_path = recover_functions_simple(fpath, dstdir)
        if repair_path is None:
            # No repair file could be produced. The file is reported here but is
            # recorded in neither successes_2 nor failed_2.
            print("ERROR", fpath)
            continue
        try:
            # Only well-formedness matters here, so release each element as soon
            # as it is complete instead of keeping the whole tree in memory.
            for _event, elem in tqdm.tqdm(ET.iterparse(repair_path), desc=f"check XML parse ({fpath.name})"):
                elem.clear()
            successes_2.append(repair_path)
        except ET.ParseError as ex:
            print("exception", type(ex), ex, "parsing", fpath, repair_path)
            # Note: the repaired file's path is stored here, not the original's.
            failed_2.append(repair_path)
        pbar.set_postfix({
            "success": len(successes_2),
            "failed": len(failed_2),
        })

print("ROUND 2:", len(failed_2), "files failed parsing")
# Kept as the cell's last expression, so the notebook displays write_text's return value.
(dstdir/"1_repair_success_2.txt").write_text("\n".join(map(str, successes_2)))

chop final fuzzer target call (trace-apache-commons-configuration-JSONConfigurationReadFuzzer.xml): 51711it [00:00, 257978.01it/s]
write file (trace-apache-commons-configuration-JSONConfigurationReadFuzzer.xml): 100%|██████████| 1/1 [00:00<00:00, 3876.44it/s]
check XML parse (trace-apache-commons-configuration-JSONConfigurationReadFuzzer.xml): 1it [00:00, 13.75it/s]
chop final fuzzer target call (trace-apache-commons-configuration-XMLConfigurationLoadFuzzer.xml): 51930it [00:00, 218018.20it/s]
write file (trace-apache-commons-configuration-XMLConfigurationLoadFuzzer.xml): 100%|██████████| 1/1 [00:00<00:00, 4258.18it/s]
check XML parse (trace-apache-commons-configuration-XMLConfigurationLoadFuzzer.xml): 1it [00:00, 3659.95it/s]
chop final fuzzer target call (trace-apache-commons-configuration-XMLConfigurationWriteFuzzer.xml): 18880it [00:00, 231878.78it/s]
write file (trace-apache-commons-configuration-XMLConfigurationWriteFuzzer.xml): 100%|██████████| 1/1 [00:00<00:00, 5178.15it/s]
che

exception <class 'xml.etree.ElementTree.ParseError'> mismatched tag: line 8, column 2 parsing traces-10m/logs-xmls/trace-groovy-TestFuzzer.xml traces-10m/postprocessed-xmls/trace-groovy-TestFuzzer.xml.repair.xml


chop final fuzzer target call (trace-guava-HostSpecifierFuzzer.xml): 646227it [00:02, 291541.38it/s]
write file (trace-guava-HostSpecifierFuzzer.xml): 100%|██████████| 2936/2936 [00:00<00:00, 186106.43it/s]
check XML parse (trace-guava-HostSpecifierFuzzer.xml): 5546it [00:00, 108449.78it/s]
chop final fuzzer target call (trace-guava-InternetDomainNameFuzzer.xml): 704127it [00:02, 330913.44it/s]
write file (trace-guava-InternetDomainNameFuzzer.xml): 100%|██████████| 7233/7233 [00:00<00:00, 244022.79it/s]
check XML parse (trace-guava-InternetDomainNameFuzzer.xml): 11755it [00:00, 63818.30it/s]
chop final fuzzer target call (trace-guava-MediaTypeFuzzer.xml): 10417it [00:00, 422823.71it/s]
write file (trace-guava-MediaTypeFuzzer.xml): 100%|██████████| 1/1 [00:00<00:00, 4328.49it/s]
check XML parse (trace-guava-MediaTypeFuzzer.xml): 1it [00:00, 196.16it/s]
chop final fuzzer target call (trace-guice-InjectorFuzzer.xml): 57750it [00:00, 569891.67it/s]
write file (trace-guice-InjectorFuzzer.xm

exception <class 'xml.etree.ElementTree.ParseError'> mismatched tag: line 20, column 2 parsing traces-10m/logs-xmls/trace-h2database-ServerLoginFuzzer.xml traces-10m/postprocessed-xmls/trace-h2database-ServerLoginFuzzer.xml.repair.xml


chop final fuzzer target call (trace-h2database-ShellFuzzer.xml): 177229it [00:00, 764406.35it/s]
write file (trace-h2database-ShellFuzzer.xml): 100%|██████████| 19/19 [00:00<00:00, 81735.15it/s]
check XML parse (trace-h2database-ShellFuzzer.xml): 13it [00:00, 36062.14it/s]


exception <class 'xml.etree.ElementTree.ParseError'> mismatched tag: line 20, column 2 parsing traces-10m/logs-xmls/trace-h2database-ShellFuzzer.xml traces-10m/postprocessed-xmls/trace-h2database-ShellFuzzer.xml.repair.xml


chop final fuzzer target call (trace-h2database-SqlPreparedStatementFuzzer.xml): 177229it [00:00, 784035.10it/s]
write file (trace-h2database-SqlPreparedStatementFuzzer.xml): 100%|██████████| 19/19 [00:00<00:00, 80741.41it/s]
check XML parse (trace-h2database-SqlPreparedStatementFuzzer.xml): 13it [00:00, 36423.48it/s]


exception <class 'xml.etree.ElementTree.ParseError'> mismatched tag: line 20, column 2 parsing traces-10m/logs-xmls/trace-h2database-SqlPreparedStatementFuzzer.xml traces-10m/postprocessed-xmls/trace-h2database-SqlPreparedStatementFuzzer.xml.repair.xml


chop final fuzzer target call (trace-h2database-SqlStatementFuzzer.xml): 177229it [00:00, 761633.02it/s]
write file (trace-h2database-SqlStatementFuzzer.xml): 100%|██████████| 19/19 [00:00<00:00, 82925.89it/s]
check XML parse (trace-h2database-SqlStatementFuzzer.xml): 13it [00:00, 34752.04it/s]


exception <class 'xml.etree.ElementTree.ParseError'> mismatched tag: line 20, column 2 parsing traces-10m/logs-xmls/trace-h2database-SqlStatementFuzzer.xml traces-10m/postprocessed-xmls/trace-h2database-SqlStatementFuzzer.xml.repair.xml


chop final fuzzer target call (trace-hamcrest-HamcrestFuzzer.xml): 188502it [00:00, 714769.75it/s]
write file (trace-hamcrest-HamcrestFuzzer.xml): 100%|██████████| 1/1 [00:00<00:00, 4202.71it/s]
check XML parse (trace-hamcrest-HamcrestFuzzer.xml): 1it [00:00, 2966.27it/s]
chop final fuzzer target call (trace-httpcomponents-client-FormBodyPartBuilderBuildFuzzer.xml): 1622775it [00:02, 732720.45it/s]
write file (trace-httpcomponents-client-FormBodyPartBuilderBuildFuzzer.xml): 100%|██████████| 1622066/1622066 [00:03<00:00, 482957.00it/s]
check XML parse (trace-httpcomponents-client-FormBodyPartBuilderBuildFuzzer.xml): 1427554it [00:16, 84508.58it/s] 
chop final fuzzer target call (trace-httpcomponents-client-HttpFuzzer.xml): 1098710it [00:03, 283952.93it/s]
write file (trace-httpcomponents-client-HttpFuzzer.xml): 100%|██████████| 1/1 [00:00<00:00, 3826.92it/s]
check XML parse (trace-httpcomponents-client-HttpFuzzer.xml): 1it [00:00,  1.53it/s]
chop final fuzzer target call (trace-jackson-

ROUND 2: 5 files failed parsing





2826

In [20]:
import tqdm as tqdm

# Opening of a `<call ...` tag whose `method="..."` attribute names a
# `fuzzerTestOneInput(...)` method with a non-empty parameter list; the match stops
# before the tag's closing `>`.
# NOTE(review): the `.` in front of `fuzzerTestOneInput` is unescaped and therefore
# matches any character, not only a literal dot — confirm this is intended.
fuzzer_start = re.compile(r'''<call[^>]*method="[^("]+.fuzzerTestOneInput\([^)]+\)"[^>]*''')
# A `<tracepoint ...>` tag, from `<tracepoint` through the first `>` that follows.
fuzzer_almost_end = re.compile(r'''<tracepoint[^>]+>''')
# A `method="..."` attribute naming a `fuzzerTestOneInput(...)` method.
method_re = re.compile(r'''method="[^("]+.fuzzerTestOneInput\([^)]+\)"''')
# The literal attribute `type="exit"`.
exit_re = re.compile(r'''type="exit"''')
# The literal closing tag `</call>`.
fuzzer_real_end = re.compile(r'''</call>''')


from xml.dom import minidom
from xml.parsers.expat import ExpatError

def prettify(rough_string):
    """Parse an XML string with minidom and return it pretty-printed.

    The result is indented with two spaces per nesting level and begins with the
    XML declaration line that minidom emits. Raises whatever
    ``minidom.parseString`` raises when the input is not well-formed.
    """
    return minidom.parseString(rough_string).toprettyxml(indent="  ")

def recover_functions(fpath, dstdir):
    """Salvage every complete fuzzer-target call from a trace file.

    The first line of ``fpath`` is copied verbatim. The remaining lines are scanned
    for fuzzer-target calls: a call opens on a line matching ``fuzzer_start`` and is
    closed by the first line containing ``</call>`` that comes after a
    ``<tracepoint>`` tag carrying both the fuzzer method and ``type="exit"``.
    Each closed call is re-parsed with ``prettify``. Calls that parse are written
    to the output without the XML declaration line; calls that do not parse are
    counted as failed and skipped. A closing ``</trace>`` is appended at the end.

    Args:
        fpath: Path of the trace file to salvage.
        dstdir: Directory (a ``Path``) that receives ``<fpath.name>.repair.xml``.

    Returns:
        The path of the written repair file.
    """
    repair_path = dstdir/(str(fpath.name) + ".repair.xml")

    # Count the lines first, purely so the progress bar below can show a total.
    with open(fpath) as f:
        num_lines = sum(1 for line in tqdm.tqdm(f, desc=f"count lines ({fpath.name})"))

    failed_functions = 0
    all_functions = 0
    with open(fpath) as f, open(repair_path, "w") as outf:
        with tqdm.tqdm(f, total=num_lines, desc=f"deconstruct into fuzzer target calls ({fpath.name})") as pbar:
            lines = iter(pbar)

            # Copy the first line through unchanged (nothing to copy for an empty file).
            header = next(lines, None)
            if header is not None:
                outf.write(header)

            # Lines of the fuzzer-target call currently being collected, or None
            # while no call is open.
            current_fuzz_function = None
            for line in lines:
                if fuzzer_start.search(line):
                    current_fuzz_function = []
                if current_fuzz_function is None:
                    # Outside any fuzzer-target call. In particular an exit
                    # tracepoint seen before any call start is ignored instead of
                    # crashing on a missing buffer.
                    continue
                current_fuzz_function.append(line)

                m = fuzzer_almost_end.search(line)
                if m is None:
                    continue
                tag = m.group(0)
                if not (method_re.search(tag) and exit_re.search(tag)):
                    continue

                # The fuzzer method has exited: keep consuming lines from the same
                # iterator up to and including the one that holds `</call>`.
                for closing_line in lines:
                    current_fuzz_function.append(closing_line)
                    if fuzzer_real_end.search(closing_line):
                        all_functions += 1
                        try:
                            func_xml = "".join(current_fuzz_function)
                            # Drop the XML declaration line that prettify() emits.
                            func_xml = "".join(prettify(func_xml).splitlines(keepends=True)[1:])
                            outf.write(func_xml + "\n")
                        except (ExpatError, ET.ParseError):
                            failed_functions += 1
                        pbar.set_postfix({"all": all_functions, "failed": failed_functions})
                        break

                # This call is finished (or the input ran out). Forget its lines so
                # they cannot be re-submitted together with later content.
                current_fuzz_function = None
        outf.write("</trace>")

    return repair_path

import tqdm
# Round 3: for every entry left in failed_2, rebuild the file call by call with
# recover_functions() and check that the result parses.
successes_3 = []
failed_3 = []
with tqdm.tqdm(failed_2, position=1, desc="round 3") as pbar:
    for fpath in pbar:
        repair_path = recover_functions(fpath, dstdir)
        try:
            # Only well-formedness matters here, so release each element as soon
            # as it is complete instead of keeping the whole tree in memory.
            for _event, elem in tqdm.tqdm(ET.iterparse(repair_path), desc=f"check XML parse ({fpath.name})"):
                elem.clear()
            successes_3.append(repair_path)
        except ET.ParseError as ex:
            print("exception", type(ex), ex, "parsing", fpath)
            failed_3.append(repair_path)
        pbar.set_postfix({
            "success": len(successes_3),
            "failed": len(failed_3),
        })

print("ROUND 3:", len(failed_3), "files failed parsing")
(dstdir/"1_repair_success_3.txt").write_text("\n".join(map(str, successes_3)))
# Last expression of the cell: display the files that still fail.
failed_3

count lines (trace-groovy-TestFuzzer.xml.repair.xml): 8it [00:00, 31714.96it/s]
deconstruct into fuzzer target calls (trace-groovy-TestFuzzer.xml.repair.xml): 100%|██████████| 8/8 [00:00<00:00, 28268.27it/s]
check XML parse (trace-groovy-TestFuzzer.xml.repair.xml): 1it [00:01,  1.44s/it]
count lines (trace-h2database-ServerLoginFuzzer.xml.repair.xml): 20it [00:00, 44267.06it/s]
deconstruct into fuzzer target calls (trace-h2database-ServerLoginFuzzer.xml.repair.xml): 100%|██████████| 20/20 [00:00<00:00, 71514.13it/s]
check XML parse (trace-h2database-ServerLoginFuzzer.xml.repair.xml): 1it [00:00, 5282.50it/s]
count lines (trace-h2database-ShellFuzzer.xml.repair.xml): 20it [00:00, 186413.51it/s]
deconstruct into fuzzer target calls (trace-h2database-ShellFuzzer.xml.repair.xml): 100%|██████████| 20/20 [00:00<00:00, 150603.38it/s]
check XML parse (trace-h2database-ShellFuzzer.xml.repair.xml): 1it [00:00, 5570.12it/s]
count lines (trace-h2database-SqlPreparedStatementFuzzer.xml.repair.xml):

ROUND 3: 0 files failed parsing





[]

In [21]:
# Every file that parsed in any round, in round order.
all_files = [*successes_1, *successes_2, *successes_3]

In [22]:
# Write the combined list of successfully parsed files, one path per line.
summary_path = dstdir / "1_repair_success_all.txt"
with summary_path.open("w") as summary_file:
    summary_file.write("\n".join(str(parsed_path) for parsed_path in all_files))