In [20]:
from pathlib import Path
import xml.etree.ElementTree as ET

log_dir = Path("logs")
log_files = list(sorted(log_dir.glob('*.xml')))
print(len(log_files))

errored = 0
for log_file in log_files:
    try:
        tree = ET.parse(log_file)
        trace = tree.getroot()
    except Exception as e:
        errored += 1
        print('Error', log_file, e)
print(f'{errored=}')

2911
Error logs/c_p00002_s603456841.xml junk after document element: line 3, column 0
Error logs/c_p02256_s256764514.xml junk after document element: line 3, column 0
Error logs/c_p02407_s006462992.xml junk after document element: line 3, column 0
Error logs/c_p02407_s023464383.xml junk after document element: line 3, column 0
Error logs/c_p02407_s044505720.xml junk after document element: line 3, column 0
Error logs/c_p02407_s051625658.xml junk after document element: line 3, column 0
Error logs/c_p02407_s052940288.xml junk after document element: line 3, column 0
Error logs/c_p02407_s076613992.xml junk after document element: line 3, column 0
Error logs/c_p02407_s103975192.xml junk after document element: line 3, column 0
Error logs/c_p02407_s112794580.xml junk after document element: line 3, column 0
Error logs/c_p02407_s131950008.xml junk after document element: line 3, column 0
Error logs/c_p02407_s138919285.xml junk after document element: line 3, column 0
Error logs/c_p02407_s14

In [21]:
from pathlib import Path
import xml.etree.ElementTree as ET

source_dir = Path("../Project_CodeNet/mini")
input_dir = Path("../Project_CodeNet/derived/input_output/data")
output_dir = Path("outputs")

sequences = []
skipped = 0
errored = 0
timedout = 0
empty = 0

for log_file in log_files:
    sequence = {}

    try:
        lang, problem, solution = log_file.stem.split('_')
        
        # Add line number to each source line
        if lang == 'c':
            lang_pathelement = 'C'
        elif lang == 'cpp':
            lang_pathelement = 'C++'
        elif lang == 'java':
            lang_pathelement = 'Java'
        source_file = source_dir / problem / lang_pathelement / (solution + '.' + lang)
        with open(source_file) as f:
            lines = f.readlines()
        lines = [l.rstrip() + f'// L{i}' for i, l in enumerate(lines, start=1)]
        sequence["src"] = '\n'.join(lines)

        sequence["filepath"] = str(source_file.relative_to(source_dir))

        # Map line number to variables/values
        state = {}
        output = {}
        tree = ET.parse(log_file)
        trace = tree.getroot()
        if len(trace) == 0:
            empty += 1
        for child in trace:
            if child.tag == 'program_point':
                filename = child.attrib["filename"]
                lineno = child.attrib["line"]
                state[lineno] = my_state = []
                for variable in child:
                    if variable.tag == 'variable':
                        my_state.append((variable.attrib["name"], variable.text))
            if variable.tag == 'timeout':
                timedout += 1
        state_words = []
        for lineno, states in state.items():
            state_words.append(f'L{lineno}')
            for variable in states:
                name, text = variable
                state_words += [name, text]
        sequence["trace"] = ' '.join(state_words)

        # Add input and output
        input_file = input_dir / problem / 'input.txt'
        with open(input_file) as f:
            sequence["input"] = f.read()
        
        output_file = output_dir / (log_file.stem + '.txt')
        with open(output_file) as f:
            sequence["output"] = f.read()

        sequences.append(sequence)
    except Exception as e:
        print('Error', log_file, e)
        errored += 1
        continue
print(empty, 'empty', skipped, 'skipped', errored, 'errored out', timedout, 'timed out')

Error logs/c_p00002_s123622353.xml 'tuple' object has no attribute 'tag'
Error logs/c_p00002_s210080877.xml 'tuple' object has no attribute 'tag'
Error logs/c_p00002_s603456841.xml junk after document element: line 3, column 0
Error logs/c_p00002_s723987517.xml 'tuple' object has no attribute 'tag'
Error logs/c_p02256_s256764514.xml junk after document element: line 3, column 0
Error logs/c_p02407_s006462992.xml junk after document element: line 3, column 0
Error logs/c_p02407_s023464383.xml junk after document element: line 3, column 0
Error logs/c_p02407_s044505720.xml junk after document element: line 3, column 0
Error logs/c_p02407_s051625658.xml junk after document element: line 3, column 0
Error logs/c_p02407_s052940288.xml junk after document element: line 3, column 0
Error logs/c_p02407_s076613992.xml junk after document element: line 3, column 0
Error logs/c_p02407_s103975192.xml junk after document element: line 3, column 0
Error logs/c_p02407_s112794580.xml junk after docume

In [22]:
import json
print(json.dumps(sequences[0], indent=2))

{
  "src": "#include<stdio.h>// L1\nint main(void)// L2\n{// L3\n    int a,b,c=1,d=0;// L4\n    while(scanf(\"%d %d\",&a,&b)!=EOF){// L5\n        c=1,d=0;// L6\n        while(1){// L7\n            c=10*c;// L8\n            d++;// L9\n            if((a+b)/c==0){// L10\n                printf(\"%d\\n\",d);// L11\n                break;// L12\n            }// L13\n        }// L14\n    }// L15\n    return 0;// L16\n}// L17",
  "filepath": "p00002/C/s004013345.c",
  "trace": "L3 a -2012475232 b 21956 c 1476263456 d 32766 L4 a -2012475232 b 21956 c 1476263456 d 32766 L5 a 1000 b 999 c 10000 d 4 L6 a 1000 b 999 c 1000 d 3 L8 a 1000 b 999 c 1000 d 3 L9 a 1000 b 999 c 10000 d 3 L10 a 1000 b 999 c 10000 d 4 L11 a 1000 b 999 c 10000 d 4 L16 a 1000 b 999 c 10000 d 4 L17 a 1000 b 999 c 10000 d 4",
  "input": "5 7\n1 99\n1000 999\n",
  "output": "2\n3\n4\n"
}


In [23]:
with open('sequences_c_cpp.json', 'w') as f:
    for sequence in sequences:
        json.dump(sequence, f)
        f.write('\n')