# Generating refactored corpus
Modify the synthetic benchmarks to generate a corpus of refactored programs.
Generate 1 refactored program for each benchmark version.

In [5]:
%load_ext autoreload
%autoreload 2
import refactorings
from pathlib import Path
import difflib
import random
import shutil
import subprocess
import random

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load projects
The input data is documented in `nb/all.csv`; the rows to use are selected in code via `project_filter`.

Parse for ground truth.
Ground truth means the flaw/fix locations (file/lineno) documented in the benchmark.
Only parse it for buggy samples since that's all that's needed for refactoring.

In [6]:
from collections import defaultdict
import functools
import pandas as pd
from pathlib import Path
import re

def corebench_manual():
    """Load the manually curated ground truth for the CoREBench projects.

    Reads the tab-separated file ``nb/corebench_manual_groundtruth.tsv`` and
    collects, per project and buggy version, the documented crash locations.
    The "Line of Bug Crash" column may hold one line number or several
    separated by commas.

    Returns:
        dict: ``{project: {buggy_version: [(filename, lineno), ...]}}`` made of
        plain dicts, so looking up an unknown key raises ``KeyError``.
    """
    locations = defaultdict(functools.partial(defaultdict, list))
    df = pd.read_csv('nb/corebench_manual_groundtruth.tsv', delimiter='\t')
    for _, row in df.iterrows():
        # pandas infers an integer column when no row lists more than one line,
        # so normalise the cell to text before splitting on commas.
        for buggy_line in str(row["Line of Bug Crash"]).split(','):
            locations[row["Project"]][row["Buggy"]].append((row["File of Bug"], int(buggy_line)))
    # Convert to normal dicts so missing keys are not silently created later.
    return {project: dict(versions) for project, versions in locations.items()}

def dbgbench(projects):
    """Collect ground truth from the DbgBench ``*.faults.txt`` files.

    Every file in ``dbgbench.github.io`` named ``<program>.<version>.faults.txt``
    is matched against ``projects`` by program name and *fixed* version
    (``p.ok``); the program name ``find`` is treated as ``findutils``. Files
    without a matching project are ignored. Each non-blank line of a matched
    file must look like ``<filename>:<lineno>``.

    Args:
        projects: iterable of objects exposing ``program``, ``ok`` and ``buggy``.

    Returns:
        dict: ``{program: {buggy_version: [(filename, lineno), ...]}}`` made of
        plain dicts.
    """
    faults_dir = Path('dbgbench.github.io')
    result = defaultdict(functools.partial(defaultdict, list))
    for faultstxt in faults_dir.glob('*.faults.txt'):
        project_name, ok = faultstxt.name.split('.')[:2]
        if project_name == 'find':
            project_name = 'findutils'
        project = next((p for p in projects if p.program == project_name and p.ok == ok), None)
        if project is None:
            continue
        with open(faultstxt) as f:
            for entry in f:
                # Tolerate blank lines instead of failing on the unpack below.
                if not entry.strip():
                    continue
                # Split on the last colon only, so a colon inside the file
                # name cannot break the (filename, lineno) pair.
                filename, lineno = entry.rsplit(':', 1)
                result[project.program][project.buggy].append((filename, int(lineno)))
    # Convert to normal dicts so missing keys are not silently created later.
    return {program: dict(versions) for program, versions in result.items()}

def synthetic(project):
    """Find the flaw locations of a synthetic benchmark from its marker comments.

    Scans every ``*.c`` file up to two directory levels below
    ``project.buggy_path`` for the marker comment of the benchmark named by
    ``project.program``:

    - ``abm``:        ``/* BAD */``  marks the flawed line itself
    - ``zitser``:     ``/* BAD */``  marks the line *before* the flawed line
    - ``ctestsuite``: ``/* FLAW */`` marks the flawed line itself
    - ``toyota``:     ``/* ERROR``   marks the flawed line itself

    Args:
        project: object exposing ``program`` (str) and ``buggy_path`` (a path
            object with a ``glob`` method).

    Returns:
        set: ``(filename, lineno)`` tuples with 1-based line numbers; empty for
        a program name not listed above.

    Raises:
        AssertionError: if no C file is found below ``project.buggy_path``.
    """
    # Marker regex per benchmark and the offset from the marker to the flawed line.
    markers = {
        'abm': (r'/\*\s*BAD\s*\*/', 0),
        'zitser': (r'/\*\s*BAD\s*\*/', 1),
        'ctestsuite': (r'/\*\s*FLAW\s*\*/', 0),
        'toyota': (r'/\*\s*ERROR', 0),
    }
    root = project.buggy_path
    # Iterate each glob result rather than concatenating with `+`, so this works
    # whether glob() returns a list or a generator.
    files = [c_file for glob_pattern in ('*/*/*.c', '*/*.c', '*.c') for c_file in root.glob(glob_pattern)]
    assert len(files) > 0, project.buggy_path

    flaws = set()
    # Exact match on the benchmark name (the former `in 'abm'` was a substring
    # test that also accepted names such as 'a' or 'bm').
    if project.program not in markers:
        return flaws
    marker, offset = markers[project.program]
    for fname in files:
        with open(fname) as f:
            for lineno, line in enumerate(f, start=1):
                if re.search(marker, line):
                    flaws.add((fname, lineno + offset))
    return flaws

def get_all_groundtruth(projects):
    """Attach the ground-truth flaw locations to every project as ``proj.flaws``.

    Programs covered by the manual CoREBench table or by DbgBench take their
    flaws from there, looked up by buggy version; every other program is
    scanned for marker comments with ``synthetic``.

    Args:
        projects: list of project objects exposing ``program`` and ``buggy``.

    Returns:
        The same ``projects`` list, each element now carrying ``flaws``.

    Raises:
        KeyError: if a program is covered by the documented ground truth but
            the project's buggy version is not.
    """
    corebench_groundtruth = {}
    for source in (corebench_manual(), dbgbench(projects)):
        for program, versions in source.items():
            # Merge per version. A plain dict.update() on the outer dict would
            # replace the whole program entry and drop versions that only the
            # earlier source documents. On a clash the later source still wins.
            corebench_groundtruth.setdefault(program, {}).update(versions)
    for proj in projects:
        if proj.program in corebench_groundtruth:
            flaws = corebench_groundtruth[proj.program][proj.buggy]
        else:
            flaws = synthetic(proj)
        proj.flaws = flaws
    return projects


Parse project info into neat dataclasses.
Assumes all projects are in a folder named `tests`.

In [7]:
import pandas as pd
from dataclasses import dataclass, field
from path import Path

# Root folder of the benchmark checkouts; ProjectInfo.init_paths resolves
# <code_root>/<program>/<version> below it.
code_root = Path('tests')

@dataclass(unsafe_hash=True)
class ProjectInfo:
    """One benchmark entry: a program together with its buggy and fixed version.

    Besides the three hashed fields, construction sets ``program_path``,
    ``buggy_path`` and ``ok_path`` (all below the module-level ``code_root``)
    and rewrites the C sources of both versions as valid UTF-8.
    """
    program: str = field(hash=True)
    buggy: str = field(hash=True)
    ok: str = field(hash=True)

    def __post_init__(self):
        # Normalise the 'find' alias to the name used for the checkout folder.
        if self.program == 'find':
            self.program = 'findutils'
        self.init_paths()
        self.replace_unicode()

    @property
    def versions(self):
        """The ``(buggy, ok)`` version identifiers."""
        return self.buggy, self.ok

    @property
    def versions_with_paths(self):
        """Iterator over ``(version, path)`` pairs, buggy first, then ok."""
        return zip((self.buggy, self.ok), (self.buggy_path, self.ok_path))

    def init_paths(self) -> None:
        """Resolve and validate the program, buggy and ok directories.

        Raises:
            AssertionError: if the buggy or the ok directory does not exist.
        """
        self.program_path = code_root / self.program
        self.buggy_path = self.program_path / self.buggy
        # Fixed: this used to be built from self.buggy, which made ok_path an
        # alias of buggy_path.
        self.ok_path = self.program_path / self.ok
        # These programs keep their sources one level deeper, in a folder
        # named after the program.
        if self.program in ['coreutils', 'findutils', 'grep', 'make']:
            self.buggy_path = self.buggy_path / self.program
            self.ok_path = self.ok_path / self.program
        assert self.buggy_path.exists(), self.buggy_path
        assert self.ok_path.exists(), self.ok_path

    def replace_unicode(self):
        """Rewrite every C file of both versions in place as valid UTF-8.

        Files are decoded with ``errors='replace'``, so undecodable bytes are
        turned into the Unicode replacement character before writing back.
        """
        # A set avoids rewriting a file matched by more than one pattern.
        files = {
            c_file
            for root in (self.buggy_path, self.ok_path)
            for pattern in ('**/*.c', '*.c')
            for c_file in root.glob(pattern)
        }
        for fname in files:
            with open(fname, 'r', encoding='utf-8', errors='replace') as f:
                text = f.read()
            with open(fname, 'w', encoding='utf-8') as f:
                f.write(text)

def get_iter(project_filter):
    """Yield a ``ProjectInfo`` for each row of ``nb/all.csv`` whose program is selected.

    Args:
        project_filter: container of program names to keep.
    """
    table = pd.read_csv('nb/all.csv')
    for _, entry in table.iterrows():
        if entry["Program"] not in project_filter:
            continue
        yield ProjectInfo(entry["Program"], entry["Intro Commit ID"], entry["Fixed Commit ID"])

def get_projects(project_filter):
    """Load the selected projects, attach their ground truth and sort them.

    Args:
        project_filter: container of program names to keep.

    Returns:
        list: projects with ``flaws`` set, ordered by their program path.
    """
    selected = get_all_groundtruth(list(get_iter(project_filter)))
    return sorted(selected, key=lambda proj: str(proj.program_path))


Filter project info

In [8]:
corebench_project_names = ['make', 'grep', 'coreutils', 'findutils']
# Benchmarks to process; uncomment the line below to add the CoREBench programs.
project_filter = ['abm', 'ctestsuite', 'zitser', 'toyota']
# project_filter += corebench_project_names
projects = get_projects(project_filter)
len(projects)

127

# Refactor code

Hyperparameters:
- Random seed: should be reset before applying each refactoring.
- `num_iterations`: Number of transformations to do
- Which transforms to apply (see the `refactorings` module)

In [36]:
from contextlib import redirect_stdout
from tqdm import tqdm_notebook
from pathlib import Path
import pandas as pd
import re
from shutil import ignore_patterns
import os

# Experiment parameters: which refactorings are available and how one is picked.
picker = refactorings.random_picker
transforms = refactorings.all_refactorings
factory = refactorings.TransformationsFactory(transforms, picker)

# Start from a clean slate: drop the log files left over from a previous run.
logfile = Path('nb/log.txt')
errors_log = Path('errors.log')
for stale_log in (logfile, errors_log):
    if stale_log.exists():
        stale_log.unlink()


def get_exclude(bench_name, project, c_file):
    """Build the ``shutil.ignore_patterns`` filter for copying a project.

    Artefacts of earlier runs (``*.c.diff``, ``*.c.back``, ``*.c.refactor``,
    ``*.c.transforms``) are always ignored. For the ``toyota`` benchmark every
    file matching ``src/*.c`` is ignored as well, except ``main.c`` and the
    target file.

    Args:
        bench_name: benchmark name.
        project: project directory (path object with a ``glob`` method).
        c_file: the C file being refactored (only its ``name`` is used).
    """
    patterns = ['*.c.diff', '*.c.back', '*.c.refactor', '*.c.transforms']
    if bench_name == 'toyota':
        keep = ('main.c', c_file.name)
        patterns.extend(src.name for src in project.glob('src/*.c') if src.name not in keep)
    return ignore_patterns(*patterns)


def proj_and_files(proj):
    """Return the distinct ``(proj, filename)`` pairs for the files that contain flaws.

    Args:
        proj: hashable project object whose ``flaws`` holds ``(filename, lineno)`` pairs.

    Returns:
        list: one pair per flawed file, in no particular order.
    """
    return list({(proj, flaw_file) for flaw_file, _ in proj.flaws})


# Refactor every flawed C file of every selected project. For each file this
# writes, next to the original:
#   <file>.refactor    the refactored source
#   <file>.transforms  the names of the applied transformations, one per line
#   <file>.diff        a unified diff between original and refactored source
print(f'Redirecting stdout to {logfile}')
for project_name in sorted({p.program for p in projects}):
    # One work item per (project, flawed file) pair of this benchmark.
    projects_and_files = []
    for p in projects:
        if p.program == project_name:
            projects_and_files.extend(proj_and_files(p))
    projects_and_files.sort(key=lambda pair: str(pair[1]))
    for proj, c_file in tqdm_notebook(projects_and_files, desc=project_name):
        # Flawed lines of this file, passed on so the refactoring can avoid them.
        avoid = [lineno for flaw_file, lineno in proj.flaws if flaw_file == c_file]
        assert len(avoid) > 0
        # Reset the seed before every file so each refactoring is reproducible on its own.
        random.seed(0)
        # Distinct handle names: the log handle must not be rebound while
        # stdout is redirected to it.
        with open(logfile, 'a') as log_f, redirect_stdout(log_f):
            exclude = get_exclude(proj.program, proj.buggy_path, c_file)
            with factory.make_project(c_file, proj.buggy_path, exclude, avoid=avoid) as refactoring_project:
                print('***REFACTORING***', refactoring_project, c_file)
                tmp_c_file, transforms_applied = refactoring_project.apply_all(return_applied=True)
                new_c_file = Path(str(c_file) + '.refactor')
                shutil.copy(tmp_c_file, new_c_file)
                with open(c_file.parent / (c_file.name + '.transforms'), 'w') as transforms_f:
                    transforms_f.write('\n'.join(t.__name__ for t in transforms_applied))
                with open(c_file) as old_f, open(new_c_file) as new_f:
                    diff = list(difflib.unified_diff(old_f.readlines(), new_f.readlines(), fromfile=c_file.name, tofile=c_file.name))
                with open(str(c_file) + '.diff', 'w') as diff_f:
                    diff_f.write(''.join(diff))

Redirecting stdout to log.txt


HBox(children=(IntProgress(value=0, description='toyota', max=46, style=ProgressStyle(description_width='initi…




In [41]:
import pandas as pd
import difflib

all_transform_names = sorted(t.__name__ for t in refactorings.all_refactorings)

# Write one CSV row per project: which transformations were applied and how
# many lines differ between the original and the refactored file.
with open('nb/stats.csv', 'w') as csv_f:
    print(','.join(('project', 'version', *all_transform_names, 'lines changed')), file=csv_f)
    for proj in projects:
        # Get files and check they exist. Only the first *.c.transforms file
        # found below the project is reported.
        try:
            transforms_file = next(proj.buggy_path.walk('*.c.transforms'))
            new_file = transforms_file.with_suffix('.refactor')
            old_file = new_file.parent / new_file.stem
        except Exception as err:
            # Name the offending project, keep the original error as the cause,
            # and let KeyboardInterrupt/SystemExit pass through untouched.
            raise Exception(f'{proj.program}-{proj.buggy}') from err
        assert transforms_file.exists()
        assert new_file.exists(), new_file
        assert old_file.exists(), old_file

        # Collect which transforms were applied
        with open(transforms_file) as f:
            transforms_applied = set(f.read().splitlines())
        was_applied = [t in transforms_applied for t in all_transform_names]

        # Collect number of changed lines; the handles are closed as soon as
        # the contents have been read.
        with old_file.open() as old_f, new_file.open() as new_f:
            old_lines, new_lines = old_f.readlines(), new_f.readlines()
        differences = sum(1 for d in difflib.ndiff(old_lines, new_lines) if d[0] in ('+', '-'))

        print(','.join((proj.program, proj.buggy, *('TRUE' if a else 'FALSE' for a in was_applied), str(differences))), file=csv_f)

In [39]:
import subprocess
import pandas as pd
from pathlib import Path
import tqdm.notebook as tqdm
import shutil
import re

def _rebuild(build_dir):
    """Run ``make clean`` and then ``make`` in build_dir; return the result of ``make``."""
    subprocess.run('make clean', cwd=str(build_dir), capture_output=True, shell=True)
    return subprocess.run('make', cwd=str(build_dir), capture_output=True, shell=True)


# Versions whose Makefile needs a manual tweak before it builds.
NEEDS_MYSQL_FLAGS = (('ctestsuite', '095'), ('ctestsuite', '097'), ('ctestsuite', '099'), ('ctestsuite', '187'))
NEEDS_LOCAL_CREATE = (('zitser', '287'), ('zitser', '289'), ('zitser', '295'))

exclude = []  # (project, version, stderr) of versions that do not build even unmodified
failed = []   # (project, version, stderr) of versions that stop building once refactored
pbar = tqdm.tqdm(projects)
for proj in pbar:
    project = proj.program
    code_dir = proj.buggy_path
    assert code_dir.exists()

    # Check each flawed file of the project in a stable order.
    proj_files = sorted({flaw_file for flaw_file, _ in proj.flaws})
    for original in proj_files:
        pbar.set_postfix({'excluded': len(exclude), 'failed': len(failed), 'file': original})
        refactored = original.with_suffix('.c.refactor')
        backup = refactored.with_suffix('.back')
        assert refactored.exists(), refactored
        assert original.exists(), original
        # Keep a pristine copy of the source; never overwrite an existing backup.
        if not backup.exists():
            shutil.copy2(original, backup)
        assert backup.exists(), backup

        # Fix up the makefile which might reference the *-ok/*-bad filenames.
        # Always start from the untouched copy (created on the first run).
        makefile = refactored.parent / 'Makefile'
        assert makefile.exists()
        makefile_backup = refactored.parent / 'Makefile.okbad'
        if not makefile_backup.exists():
            shutil.copy2(makefile, makefile_backup)
        else:
            shutil.copy2(makefile_backup, makefile)
        with open(makefile) as makefile_in:
            text = makefile_in.read()
        for suffix_pattern in (r'(\w+)-ok', r'(\w+)-bad'):
            text = re.sub(suffix_pattern, r'\1', text)
        version_key = (proj.program, proj.buggy)
        if version_key in NEEDS_MYSQL_FLAGS:
            text = text.replace('$(CC) $(CFLAGS) $(SRC)', '$(CC) $(CFLAGS) $(SRC) $(shell mysql_config --cflags --libs)')
        if version_key in NEEDS_LOCAL_CREATE:
            text = text.replace('create\n', './create\n')
        with open(makefile, 'w') as makefile_out:
            makefile_out.write(text)

        # Baseline: the unmodified source has to build, otherwise skip the file.
        proc = _rebuild(code_dir)
        if proc.returncode != 0:
            exclude.append((project, proj.buggy, proc.stderr.decode()))
            continue

        # Swap in the refactored source, rebuild, and always restore the original.
        try:
            shutil.copy(refactored, original)
            proc = _rebuild(code_dir)
            if proc.returncode != 0:
                failed.append((project, proj.buggy, proc.stderr.decode()))
        finally:
            shutil.copy(backup, original)

# Prerequisite: install mysql-devel and pam-devel first.
# Some projects in ctestsuite won't build.
def _write_report(csv_path, log_path, entries):
    """Write the (project, version) pairs to csv_path and their build errors to log_path."""
    with open(csv_path, 'w') as report_csv:
        report_csv.write('project,version\n')
        for project, version, _ in entries:
            report_csv.write(f'{project},{version}\n')
    with open(log_path, 'w') as report_log:
        for project, version, e in entries:
            report_log.write(f'***tests/{project}/{version}***\n')
            report_log.write(f'{e}\n')


print(len(exclude), 'versions didn\'t build')
_write_report('nb/exclude.csv', 'nb/exclude.log', exclude)

print(len(failed), 'versions failed after refactoring')
_write_report('nb/failed.csv', 'nb/failed.log', failed)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


0 versions didn't build
1 versions failed after refactoring


To pack up the refactorings, use `tar cf refactors.tar $(find tests -name '*.c.refactor' -o -name '*.c.diff' -o -name '*.c.transforms')`. These are the suffixes written by the refactoring cell above.