# Generating refactored corpus
Modify the synthetic benchmarks to generate a corpus of refactored programs.
Generate 1 refactored program for each benchmark version.

In [1]:
%load_ext autoreload
%autoreload 2
import refactorings
from pathlib import Path
import difflib
import random
import pandas as pd
import shutil
import subprocess
import random

## Begin experiment

Subject benchmarks:
- ABM
- C Test Suite

Hyperparameters:
- random seed
- num_iterations: Number of transformations to do
- transforms: the set of refactorings that may be applied (set to `refactorings.all_refactorings` below)


In [14]:
from contextlib import redirect_stdout
from tqdm import tqdm_notebook
from collections import defaultdict

# Experiment parameters: which refactorings may be applied, and the strategy
# used to pick among them. Both are handed to the factory that builds the
# per-file refactoring projects.
transforms = refactorings.all_refactorings
picker = refactorings.random_picker

factory = refactorings.TransformationsFactory(transforms, picker)

def store_file(new, old):
    """Overwrite ``old`` with the contents of ``new`` and save a unified diff beside it.

    The diff compares what ``old`` contained *before* being overwritten with the
    contents of ``new``. It is written to ``<old>.diff`` and uses ``old.name`` as
    both the "from" and "to" label.

    Args:
        new: Path of the file holding the new contents.
        old: Path of the file to overwrite. If it does not exist yet, the diff is
            taken against an empty file.
    """
    # Snapshot the previous contents first: diffing after the copy would compare
    # two identical files and always produce an empty diff.
    if old.exists():
        with old.open() as f:
            old_lines = f.readlines()
    else:
        old_lines = []
    with new.open() as f:
        new_lines = f.readlines()

    diff_text = ''.join(difflib.unified_diff(old_lines, new_lines, fromfile=old.name, tofile=old.name))

    shutil.copy(new, old)
    with open(str(old) + '.diff', 'w') as f:
        f.write(diff_text)

def get_file(project):
    """Return the C source file to refactor inside ``project``.

    Files named ``*.formatted.c`` or ``*.new.c`` are ignored. If exactly one
    candidate remains, it is returned. Otherwise the candidate whose text contains
    a ``/* BAD */`` or ``/* FLAW */`` marker is returned; when several candidates
    are marked, the last one in glob order wins.

    Args:
        project: Directory (a ``pathlib.Path``) searched recursively for ``*.c`` files.

    Returns:
        Path of the selected C file.

    Raises:
        AssertionError: if there is no candidate at all, or there are several
            candidates and none of them carries a marker.
    """
    import re  # local import: the enclosing cell does not import re at the top

    candidates = [c for c in project.glob('**/*.c')
                  if not c.name.endswith(('.formatted.c', '.new.c'))]
    assert len(candidates) >= 1, f'No C files found in {project}'

    if len(candidates) == 1:
        return candidates[0]

    marker = re.compile(r'/\*\s*(?:BAD|FLAW)\s*\*/')
    marked = []
    for fpath in candidates:
        with fpath.open() as f:
            if marker.search(f.read()):
                marked.append(fpath)

    # Fail with a readable message instead of an UnboundLocalError when no
    # candidate is marked.
    assert marked, f'No C file with a BAD/FLAW marker found in {project}'
    return marked[-1]

# The sample list holds every buggy version from the synthetic benchmarks in the
# format "project-version", one sample per line. Uncomment an alternative below
# to run on a different corpus.
# df = pd.read_csv('logs/synthetic-samples.csv', dtype=str)
# df = pd.read_csv('logs/toyota.csv', dtype=str)
df = pd.read_csv('logs/zitser.csv', dtype=str)
# df = pd.read_csv('failed-crlf-real.csv', dtype=str)
samples = list(zip(df["project"], df["version"]))
tests = Path('tests')
all_projects = [tests / bench / version for bench, version in samples]

# Group (project directory, target C file) pairs by benchmark name.
samples_by_project = defaultdict(list)
for bench, version in samples:
    project = tests / bench / version
    assert project.exists()
    if 'itc' in str(project):
        # Every C file found under an itc project is its own target.
        targets = project.glob('**/*.c')
    else:
        targets = [get_file(project)]
    for c_file in targets:
        samples_by_project[bench].append((project, c_file))

# Remove logs left over from a previous run.
logfile = Path('log.txt')
for stale_log in (logfile, Path('errors.log')):
    if stale_log.exists():
        stale_log.unlink()

failed_df = pd.read_csv('logs/manual.csv', dtype=str)

def get_exclude(bench_name, project, c_file):
    """Build the exclude pattern for a benchmark, project directory and target C file.

    For the 'itc' benchmark the result is a ``shutil.ignore_patterns`` callable that
    ignores every ``*.c`` file directly under ``project / 'src'`` except the one
    sharing its name with ``c_file``. For any other benchmark the result is None.
    """
    if bench_name != 'itc':
        return None

    srcdir = project / 'src'
    other_sources = [
        str(path.relative_to(srcdir))
        for path in srcdir.glob('*.c')
        if path.name != c_file.name
    ]
    return shutil.ignore_patterns(*other_sources)

# projects = [p for p in projects if s == 'ctestsuite' and p.name == '111']
# projects = [p for p in projects if any(s == row["project"] and p.name == row["version"] for i, row in failed_df.iterrows())]
# Refactor every sampled file. Everything printed while refactoring goes to
# `logfile`; only the progress bars stay in the notebook output.
print(f'Redirecting stdout to {logfile}')
for benchmark_name, projects_and_files in samples_by_project.items():
    # The progress bar is labelled with the benchmark currently being processed.
    for project, c_file in tqdm_notebook(projects_and_files, desc=benchmark_name):
        # Reset the RNG before every file so the random choices made for one
        # file do not depend on which files were processed before it.
        random.seed(0)
        with open(logfile, 'a') as f, redirect_stdout(f):
            exclude = get_exclude(benchmark_name, project, c_file)
            # Keep a copy of the file as `<name>.back` before anything is
            # written over it.
            shutil.copy(c_file, c_file.parent / (c_file.name + '.back'))
            # Bind the refactoring project to its own name instead of
            # shadowing the loop variable `project` (the project directory).
            with factory.make_project(c_file, project, exclude) as refactoring_project:
                print('***REFACTORING***', refactoring_project, c_file)
                tmp_c_file = refactoring_project.apply_all()
                # NOTE(review): new_c_file is computed but never used; the result
                # is stored over c_file itself. Later cells look for
                # '*.c.reformat' files -- confirm which destination is intended.
                new_c_file = Path(str(c_file) + '.reformat')
                store_file(tmp_c_file, c_file)

Loaded (['insert_noop', 'loop_exchange', 'permute_stmt', 'rename_variable', 'switch_exchange'])


Redirecting stdout to log.txt


HBox(children=(IntProgress(value=0, description='zitser', max=14, style=ProgressStyle(description_width='initi…




In [11]:
import pandas as pd
import difflib

all_transform_names = sorted([t.__name__ for t in transforms])


def _read_lines(path):
    """Return the lines of the text file at ``path``, closing the file afterwards."""
    with open(path) as f:
        return f.readlines()


# Write one CSV row per benchmark version: a TRUE/FALSE column per transform
# name, followed by the number of changed lines.
with open('logs/stats.csv', 'w') as csv_f:
    print(','.join(('project', 'version', *all_transform_names, 'lines changed')), file=csv_f)

    tests = Path('tests')
    df = pd.read_csv('logs/synthetic-samples.csv', dtype=str)
    for i, row in df.iterrows():
        # Get files and check they exist
        try:
            home = tests / row["project"] / str(row["version"])
            transforms_file = next(home.glob('*.transforms.txt'))
            old_file = next(transforms_file.parent.glob('*.c.reformat'))
            # Dropping the final suffix of "<name>.c.reformat" yields "<name>.c".
            new_file = old_file.parent / old_file.stem
            assert transforms_file.exists()
            assert old_file.exists()
            assert new_file.exists()
        except Exception:
            # Report which sample is incomplete, then let the error propagate.
            print('Error', row["project"], row["version"])
            raise

        # Collect which transforms were applied
        with open(transforms_file) as f:
            transforms_applied = set(f.read().splitlines())
        was_applied = [t in transforms_applied for t in all_transform_names]

        # Collect number of changed lines: every ndiff line starting with '+' or '-'.
        differences = sum(
            1
            for d in difflib.ndiff(_read_lines(old_file), _read_lines(new_file))
            if d[0] in ('+', '-')
        )

        print(','.join((row["project"], str(row["version"]), *('TRUE' if a else 'FALSE' for a in was_applied), str(differences))), file=csv_f)

In [20]:
import subprocess
import pandas as pd
from pathlib import Path
import tqdm
import shutil

import re  # imported once here instead of on every loop iteration


def _rebuild(code_dir):
    """Run ``make clean`` followed by ``make`` in ``code_dir``.

    Returns the ``CompletedProcess`` of the ``make`` invocation, with its output
    captured. The result of ``make clean`` is not checked.
    """
    subprocess.run('make clean', cwd=str(code_dir), shell=True)
    return subprocess.run('make', cwd=str(code_dir), capture_output=True, shell=True)


# df = pd.read_csv('logs/synthetic-samples.csv', dtype=str)
df = pd.read_csv('logs/zitser.csv', dtype=str)
# df = pd.read_csv('manual.csv', dtype=str)
tests = Path('tests')
exclude = []  # (project, version, stderr) for versions that fail to build as-is
failed = []   # (project, version, stderr) for versions that fail only after refactoring
for i, row in tqdm.tqdm_notebook(list(df.iterrows())):
    project, version = row["project"], row["version"]
    code_dir = tests / project / version
    assert code_dir.exists()

    refactored = next(code_dir.glob('*.c.reformat'))
    # "<name>.c.reformat" -> "<name>.c" and "<name>.c.back"
    original = refactored.parent / refactored.stem
    backup = refactored.parent / (refactored.stem + '.back')
    assert refactored.exists(), refactored
    assert original.exists(), original
    assert backup.exists(), backup

    makefile = refactored.parent / 'Makefile'
    assert makefile.exists()
    # Keep the untouched Makefile once; later runs must not overwrite that backup
    # with an already-rewritten Makefile.
    makefile_backup = refactored.parent / 'Makefile.okbad'
    if not makefile_backup.exists():
        shutil.copy2(makefile, makefile_backup)
    with open(makefile) as f:
        text = f.read()
    # Strip the "-ok" / "-bad" suffixes from names in the Makefile.
    text = re.sub(r'(\w+)-ok', r'\1', text)
    text = re.sub(r'(\w+)-bad', r'\1', text)
    with open(makefile, 'w') as f:
        f.write(text)

    # Check return code
    proc = _rebuild(code_dir)
    if proc.returncode != 0:
        # errors='replace': compiler output is not guaranteed to be valid UTF-8,
        # and one undecodable byte must not abort the whole run.
        exclude.append((project, version, proc.stderr.decode(errors='replace')))
        continue

    try:
        shutil.copy(refactored, original)
        original.touch()
        proc = _rebuild(code_dir)
        if proc.returncode != 0:
            failed.append((project, version, proc.stderr.decode(errors='replace')))
    finally:
        # Always put the backed-up source back in place.
        shutil.copy(backup, original)

# install mysql-devel and pam-devel first.
# Some projects in ctestsuite won't build.
# Each report is a summary line in the notebook, a CSV listing the affected
# versions, and a text file carrying the captured error output per version.
reports = (
    (exclude, "versions didn't build", 'logs/exclude.csv', 'logs/exclude.txt'),
    (failed, 'versions failed after refactoring', 'logs/failed.csv', 'logs/failed.log'),
)
for entries, summary, csv_path, log_path in reports:
    print(len(entries), summary)
    with open(csv_path, 'w') as f:
        f.write('project,version\n')
        for project, version, e in entries:
            f.write(f'{project},{version}\n')
    with open(log_path, 'w') as f:
        for project, version, e in entries:
            f.write(f'***tests/{project}/{version}***\n')
            f.write(f'{e}\n')

HBox(children=(IntProgress(value=0, max=14), HTML(value='')))


1 versions didn't build
1 versions failed after refactoring


To pack up refactorings, use `tar cf refactors.tar $(find itc -name '*.c.reformat' -o -name '*.c.diff' -o -name '*.transforms.txt')`.

# Look for ground truth
Ground truth means:
1. The buggy and OK versions each have at least one flaw location that matches a meta-alert in the DB.
2. Among the meta-alerts matched to flaw locations, the buggy and OK versions share conditions for at least one meta-alert.

In [24]:
import projectslib.projects
import projectslib.corebench_groundtruth
import re
# Analyze ground truth
from itertools import chain

corebench_project_names = ['make', 'grep', 'coreutils', 'findutils']
project_filter = []
project_filter += ['abm', 'ctestsuite', 'zitser', 'toyota']
# project_filter += corebench_project_names
projects = projectslib.projects.get(project_filter)
print(len(projects))

# Per benchmark: the regex that marks a flaw, and the offset from the marker's
# line to the recorded flaw line (zitser records the line after the marker).
FLAW_MARKERS = {
    'abm': (re.compile(r'/\*\s*BAD\s*\*/'), 0),
    'zitser': (re.compile(r'/\*\s*BAD\s*\*/'), 1),
    'ctestsuite': (re.compile(r'/\*\s*FLAW\s*\*/'), 0),
    'toyota': (re.compile(r'/\*\s*ERROR'), 0),
}

corebench_manual_groundtruth = projectslib.corebench_groundtruth.get(projects)
# NOTE(review): `flaws` is rebuilt for every project but is not stored or used
# in the visible part of this cell -- confirm whether it should be collected.
for proj in projects:
    if proj.program in corebench_manual_groundtruth:
        flaws = corebench_manual_groundtruth[proj.program][proj.buggy]
    else:
        flaws = set()
        # Two glob results cannot be joined with `+` when they are generators
        # (assumes buggy_path is a pathlib.Path -- TODO confirm), so chain them.
        # dict.fromkeys drops files returned by both patterns, keeping order.
        files = list(dict.fromkeys(chain(proj.buggy_path.glob('**/*.c'), proj.buggy_path.glob('*.c'))))
        assert len(files) > 0, proj.buggy_path
        # Exact lookup by benchmark name; a substring test such as
        # `program in 'abm'` would also accept 'a', 'bm' or ''.
        if proj.program in FLAW_MARKERS:
            pattern, line_offset = FLAW_MARKERS[proj.program]
            for fname in files:
                with open(fname) as f:
                    for i, line in enumerate(f, start=1):
                        if pattern.search(line):
                            flaws.add((fname, i + line_offset))


127
