In [7]:
from pathlib import Path
import xml.etree.ElementTree as ET

# Collect every XML trace log in a stable (sorted) order; later cells reuse `log_files`.
log_dir = Path("logs")
log_files = sorted(log_dir.glob('*.xml'))
print(len(log_files))

# Dry run: attempt to parse each log and report the ones that fail,
# so the number of unusable traces is known before building sequences.
errored = 0
for log_file in log_files:
    try:
        tree = ET.parse(log_file)
    except Exception as e:
        print('Error', log_file, e)
        errored += 1
    else:
        trace = tree.getroot()
print(f'{errored=}')

1439
Error logs/java_p00002_s334870499.xml no element found: line 1, column 0
Error logs/java_p00002_s338496958.xml no element found: line 1, column 0
Error logs/java_p00002_s365138879.xml no element found: line 1, column 0
Error logs/java_p00002_s374017987.xml no element found: line 1, column 0
Error logs/java_p00002_s376766593.xml no element found: line 1, column 0
Error logs/java_p00002_s792181456.xml no element found: line 302, column 109
Error logs/java_p00002_s802868720.xml unclosed token: line 110, column 0
Error logs/java_p02256_s279856357.xml no element found: line 1, column 0
Error logs/java_p02256_s656638000.xml no element found: line 1, column 0
Error logs/java_p02256_s666464379.xml no element found: line 1, column 0
Error logs/java_p02256_s700014159.xml unclosed token: line 2030, column 0
Error logs/java_p02400_s369612578.xml no element found: line 1, column 0
Error logs/java_p02400_s372051013.xml no element found: line 1, column 0
Error logs/java_p02400_s375626370.xml no 

In [8]:
from pathlib import Path
import xml.etree.ElementTree as ET

source_dir = Path("../Project_CodeNet/mini")
input_dir = Path("../Project_CodeNet/derived/input_output/data")
output_dir = Path("outputs")

# Maps the language tag used in log file names to the directory name used
# under each problem in `source_dir`.
LANG_DIRS = {'c': 'C', 'cpp': 'C++', 'java': 'Java'}

sequences = []   # one dict per successfully processed log file
skipped = 0      # never incremented in this cell; kept for the summary line
errored = 0      # log files whose processing raised any exception
timedout = 0     # number of <timeout> elements seen directly under a trace root
empty = 0        # traces whose root element has no children

# Build one record per log file with keys "src", "filepath", "trace",
# "input" and "output". Any failure for a file is printed, counted in
# `errored`, and that file is left out of `sequences`.
for log_file in log_files:
    sequence = {}

    try:
        # Log file stems have the form <lang>_<problem>_<solution>.
        lang, problem, solution = log_file.stem.split('_')

        # Fail explicitly on an unknown language instead of reusing the
        # directory name left over from a previous iteration.
        if lang not in LANG_DIRS:
            raise ValueError(f'unsupported language {lang!r}')
        source_file = source_dir / problem / LANG_DIRS[lang] / (solution + '.' + lang)

        # Add line number to each source line
        with open(source_file) as f:
            lines = f.readlines()
        lines = [l.rstrip() + f'// L{i}' for i, l in enumerate(lines, start=1)]
        sequence["src"] = '\n'.join(lines)

        sequence["filepath"] = str(source_file.relative_to(source_dir))

        # Map line number to variables/values
        state = {}
        tree = ET.parse(log_file)
        trace = tree.getroot()
        if len(trace) == 0:
            empty += 1
        for child in trace:
            if child.tag == 'program_point':
                lineno = child.attrib["line"]
                # A later program point on the same line replaces the
                # earlier snapshot for that line.
                state[lineno] = my_state = []
                for variable in child:
                    if variable.tag == 'variable':
                        my_state.append((variable.attrib["name"], variable.text))
            elif child.tag == 'timeout':
                # Test the trace-level element itself; the inner loop's
                # `variable` is stale (or unbound) at this point.
                timedout += 1

        # Flatten to "L<line> name value name value ..." tokens.
        state_words = []
        for lineno, states in state.items():
            state_words.append(f'L{lineno}')
            for name, text in states:
                state_words += [name, text]
        sequence["trace"] = ' '.join(state_words)

        # Add input and output
        input_file = input_dir / problem / 'input.txt'
        with open(input_file) as f:
            sequence["input"] = f.read()

        output_file = output_dir / (log_file.stem + '.txt')
        with open(output_file) as f:
            sequence["output"] = f.read()

        sequences.append(sequence)
    except Exception as e:
        print('Error', log_file, e)
        errored += 1
        continue
print(empty, 'empty', skipped, 'skipped', errored, 'errored out', timedout, 'timed out')

Error logs/java_p00002_s334870499.xml no element found: line 1, column 0
Error logs/java_p00002_s338496958.xml no element found: line 1, column 0
Error logs/java_p00002_s365138879.xml no element found: line 1, column 0
Error logs/java_p00002_s374017987.xml no element found: line 1, column 0
Error logs/java_p00002_s376766593.xml no element found: line 1, column 0
Error logs/java_p00002_s792181456.xml no element found: line 302, column 109
Error logs/java_p00002_s802868720.xml unclosed token: line 110, column 0
Error logs/java_p02256_s279856357.xml no element found: line 1, column 0
Error logs/java_p02256_s656638000.xml no element found: line 1, column 0
Error logs/java_p02256_s666464379.xml no element found: line 1, column 0
Error logs/java_p02256_s700014159.xml unclosed token: line 2030, column 0
Error logs/java_p02400_s369612578.xml no element found: line 1, column 0
Error logs/java_p02400_s372051013.xml no element found: line 1, column 0
Error logs/java_p02400_s375626370.xml no eleme

In [9]:
import json

# Pretty-print the first record as a quick sanity check of the structure.
print(json.dumps(sequences[0], indent=2))

# Write all records in JSON Lines form: one compact JSON object per line.
with open('sequences_java.json', 'w') as f:
    for sequence in sequences:
        f.write(json.dumps(sequence))
        f.write('\n')

{
  "src": "import java.util.Scanner;// L1\n// L2\nclass Main{// L3\n    public static void main(String[] a){// L4\n        Scanner s = new Scanner(System.in);// L5\n// L6\n        while(s.hasNext()){// L7\n            System.out.println(String.valueOf(s.nextInt() + s.nextInt()).length());// L8\n        }// L9\n    }// L10\n}// L11",
  "filepath": "p00002/Java/s021798406.java",
  "trace": "L5 a [] L7 s \"java.util.Scanner[delimiters=\\p{javaWhitespace}+][position=17][match valid=true][need input=false][source closed=false][skipped=false][group separator=\\x{2c}][decimal separator=\\x{2e}][positive prefix=][negative prefix=\\Q-\\E][positive suffix=][negative suffix=][NaN string=\\Q\ufffd\\E][infinity string=\\Q\u221e\\E]\" a [] L8 s \"java.util.Scanner[delimiters=\\p{javaWhitespace}+][position=8][match valid=false][need input=false][source closed=false][skipped=false][group separator=\\x{2c}][decimal separator=\\x{2e}][positive prefix=][negative prefix=\\Q-\\E][positive suffix=][negativ