In [12]:
from pathlib import Path
import xml.etree.ElementTree as ET

# Gather every XML trace log under logs/ in a stable, name-sorted order.
# sorted() already returns a list, so no extra list() wrapper is needed.
log_dir = Path("logs")
log_files = sorted(log_dir.glob('*.xml'))
print(len(log_files))

4


In [53]:
from pathlib import Path
import xml.etree.ElementTree as ET
import traceback

source_dir = Path("../Project_CodeNet/mini")
input_dir = Path("../Project_CodeNet/derived/input_output/data")
output_dir = Path("outputs")

# Maps the language prefix of a log file name to the per-language directory
# name used below each problem directory in source_dir.
LANG_DIRS = {'c': 'C', 'cpp': 'C++', 'java': 'Java'}


def _annotate_source(source_file):
    """Return the text of source_file with `// L<n>` appended to every line.

    Trailing whitespace (including the newline) is stripped from each line
    before the marker is added; lines are numbered from 1 and re-joined
    with a single newline.
    """
    with open(source_file) as f:
        return '\n'.join(
            line.rstrip() + f'// L{i}' for i, line in enumerate(f, start=1)
        )


def _trace_to_text(trace):
    """Flatten a parsed trace element into one space-separated string.

    Only `program_point` children are considered, and within them only
    `variable` children whose `type` attribute is 'new' or 'modified'.
    Consecutive program points on the same line are merged into one group;
    groups without any reported variable are omitted. Each remaining group
    is rendered as `L<line> <type> var: <name> = <value> ...`.
    """
    states = []  # [(lineno, [change tuples])] in execution order
    for point in trace:
        if point.tag != 'program_point':
            continue
        # NOTE(review): the reported line is shifted down by one; presumably the
        # tracer reports the line about to run, so the changes belong to the
        # line before it. TODO confirm against the tracer.
        lineno = int(point.attrib["line"]) - 1
        changes = []
        for variable in point:
            if variable.tag != 'variable':
                continue
            age = variable.attrib["type"]
            if age in ('new', 'modified'):
                # .text is None for an element without text; fall back to ''
                # so the final ' '.join() does not raise TypeError.
                changes.append(
                    (age, 'var:', variable.attrib["name"], '=', variable.text or '')
                )
        if states and states[-1][0] == lineno:
            # Same line as the previous program point: keep the changes under
            # this line instead of leaking them into the next line's group.
            states[-1][1].extend(changes)
        else:
            states.append((lineno, changes))

    words = []
    for lineno, changes in states:
        if changes:
            words.append(f'L{lineno}')
            for change in changes:
                words.extend(change)
    return ' '.join(words)


sequences = []
skipped = 0
errored = 0
timedout = 0
empty = 0

for log_file in log_files:
    try:
        # Log files are named <lang>_<problem>_<solution>.xml
        lang, problem, solution = log_file.stem.split('_')
        if lang not in LANG_DIRS:
            # Fail loudly instead of reusing the directory name left over
            # from a previous iteration.
            raise ValueError(f'unsupported language {lang!r}')

        sequence = {}

        # Add line number to each source line
        source_file = source_dir / problem / LANG_DIRS[lang] / (solution + '.' + lang)
        sequence["src"] = _annotate_source(source_file)
        sequence["filepath"] = str(source_file.relative_to(source_dir))

        # Map line number to variables/values
        trace = ET.parse(log_file).getroot()
        if len(trace) == 0:
            # Counted, but the record is still kept (with an empty trace).
            empty += 1
        sequence["trace"] = _trace_to_text(trace)

        # Add input and output
        input_file = input_dir / problem / 'input.txt'
        with open(input_file) as f:
            sequence["input"] = f.read()

        output_file = output_dir / (log_file.stem + '.txt')
        with open(output_file) as f:
            sequence["output"] = f.read()

        sequences.append(sequence)
    except Exception:
        # Best effort: report the failing log, count it and move on.
        print('Error', log_file)
        traceback.print_exc()
        errored += 1
print(empty, 'empty', skipped, 'skipped', errored, 'errored out', timedout, 'timed out')

0 empty 0 skipped 0 errored out 0 timed out


In [54]:
import json
# Pretty-print every collected record as one indented JSON array.
sequences_json = json.dumps(sequences, indent=2)
print(sequences_json)

[
  {
    "src": "#include <iostream>// L1\n#include <algorithm>// L2\n#include <cstdio>// L3\n#include <string>// L4\n#include <sstream>// L5\n// L6\nusing namespace std;// L7\n// L8\nint main(){// L9\n    int a, b;// L10\n    stringstream s;// L11\n    while(cin >> a >> b){// L12\n    s << a + b;// L13\n    cout << s.str().size() << endl;// L14\n    }// L15\n    return 0;// L16\n}// L17",
    "filepath": "p00002/C++/s011509553.cpp",
    "trace": "L8 new var: a = -55208960 new var: b = -28222057 new var: s = <error> L11 modified var: s = \"\" L12 modified var: a = 5 modified var: b = 7 L13 modified var: s = \"12\" L12 modified var: a = 1 modified var: b = 99 L13 modified var: s = \"12100\" L12 modified var: a = 1000 modified var: b = 999 L13 modified var: s = \"121001999\"",
    "input": "5 7\n1 99\n1000 999\n",
    "output": "2\n5\n9\n"
  },
  {
    "src": "#include<iostream>// L1\nusing namespace std;// L2\n// L3\nint main(){// L4\n\tint count;// L5\n\tint num1,num2,sum;// L6\n// L7

In [15]:
# Persist the records with one JSON document per line.
with open('sequences_c_cpp.json', 'w') as f:
    f.writelines(json.dumps(sequence) + '\n' for sequence in sequences)