# Generating refactored corpus
Modify the synthetic benchmarks to generate a corpus of refactored programs.
Generate 1 refactored program for each benchmark version.

In [9]:
# Run the test suite of the refactorings package before generating the corpus
# (-s: do not capture stdout, so messages printed while loading are visible).
!pytest refactorings/ -s

































































platform linux -- Python 3.7.4, pytest-5.2.1, py-1.8.0, pluggy-0.13.0
rootdir: /mnt/Hydrogen_Volume/benjis/work/transform
plugins: openfiles-0.4.0, remotedata-0.3.2, arraydiff-0.3, doctestplus-0.4.0, importnb-0.7.0, anyio-3.2.1
[1mcollecting ... [0mLoaded (['insert_noop', 'loop_exchange', 'permute_stmt', 'rename_variable', 'switch_exchange'])
collected 5 items                                                              [0m[1m

refactorings/test_refactorings.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m



In [10]:
import refactorings as transformations
import importlib
importlib.reload(transformations)

Loaded (['insert_noop', 'loop_exchange', 'permute_stmt', 'rename_variable', 'switch_exchange'])


<module 'refactorings' from '/mnt/Hydrogen_Volume/benjis/work/transform/refactorings/__init__.py'>

In [11]:
from pathlib import Path
import difflib
import random
from lxml import etree as et
import pandas as pd
import shutil
import subprocess
import random
import copy
import traceback
import datetime

In [12]:
def reformat(old_lines, new_file):
    """Run clang-format over the lines of ``new_file`` that were added relative to ``old_lines``.

    Args:
        old_lines: Lines of the file (including line endings) before it was changed.
        new_file: Path of the changed file on disk; it is reformatted in place.

    Returns:
        ``new_file``, so the caller can keep using the result as the file path.

    A clang-format failure (non-zero exit status, or the executable not being
    runnable) is printed and otherwise ignored. When no lines were added the
    file is left untouched.
    """
    import difflib
    with open(new_file) as f:
        new_lines = f.readlines()

    # Collect maximal runs of consecutive added lines as [first, last] pairs,
    # numbered from 1 in the new file.
    ranges = []
    lineno = 0
    for line in difflib.Differ().compare(old_lines, new_lines):
        tag = line[0]
        if tag in (' ', '+'):
            # Only unchanged and added lines exist in the new file.
            lineno += 1
        if tag == '+':
            if ranges and ranges[-1][1] == lineno - 1:
                ranges[-1][1] = lineno
            else:
                ranges.append([lineno, lineno])

    if not ranges:
        # Without any --lines argument clang-format would reformat the whole
        # file, so skip the call when nothing was added.
        return new_file

    # Pass an argument list (no shell) so paths containing spaces or shell
    # metacharacters are handled correctly.
    cmd = ['clang-format']
    cmd.extend(f'--lines={first}:{last}' for first, last in ranges)
    cmd.extend(['-i', '-style={BasedOnStyle: llvm, IndentWidth: 4}', str(new_file)])
    try:
        proc = subprocess.run(cmd, capture_output=True)
    except OSError as e:
        # e.g. clang-format is not installed; keep this step best-effort.
        print(e)
        return new_file
    if proc.returncode != 0:
        print(proc.stderr.decode())
    return new_file
    
def c2c(project, c_filename, transforms, picker, num_iterations):
    """Do C source-to-source translation on a scratch copy of a project.

    The project directory is copied to ``c2c/<project name>`` (replacing any
    previous copy) and every transform is attempted once, in list order, on
    the copied C file. After each successful transform the names of all
    transforms applied so far are written to ``<stem>.transforms.txt`` next to
    the ORIGINAL ``c_filename``.

    Args:
        project: Path of the project directory that contains ``c_filename``.
        c_filename: Path of the C file to transform; must be inside ``project``.
        transforms: Callables invoked as ``t(path, picker=picker, info=info)``.
            Each returns the new lines of the file, or ``None`` when it could
            not be applied. The caller's list is not modified.
        picker: Passed through unchanged to every transform.
        num_iterations: Currently unused; kept for interface compatibility.

    Returns:
        Path of the transformed copy of the C file.

    Exceptions raised while applying a transform are printed, appended with a
    stack trace to ``errors.log``, and the transform is skipped.
    """
    # Work on a private copy so the caller's list is never consumed.
    pending = copy.deepcopy(transforms)
    transform_filename = c_filename.parent / (c_filename.stem + '.transforms.txt')
    tmp_dir = Path('c2c')
    tmp_dir.mkdir(exist_ok=True)
    tmp_project = tmp_dir / project.name
    if tmp_project.exists():
        shutil.rmtree(tmp_project)
    shutil.copytree(project, tmp_project)
    tmp_c_filename = tmp_project / c_filename.relative_to(project)
    shutil.copy(c_filename, tmp_c_filename)
    info = {"project": tmp_c_filename.parent}

    transforms_applied = []
    def log_transforms_applied():
        """Log after each transform, for debugging in case the procedure is errored."""
        with open(transform_filename, 'w') as f:
            for t in transforms_applied:
                f.write(f'{t.__name__}\n')

    # Try every transform exactly once, in the order given.
    while pending:
        # Take the transform off the queue up front so that no code path
        # (success, "could not apply", or error) has to remove it again.
        t = pending.pop(0)
        try:
            # switch_exchange output is run through clang-format afterwards,
            # which needs the file contents from before the transform.
            needs_reformat = t == transformations.switch_exchange
            if needs_reformat:
                with open(tmp_c_filename) as f:
                    old_lines = f.readlines()

            new_lines = t(tmp_c_filename, picker=picker, info=info)
            if new_lines is None:
                print(f'Could not apply {t.__name__}.')
                continue
            with open(tmp_c_filename, 'w') as f:
                f.writelines(new_lines)
            print('Applied', t.__name__)
            transforms_applied.append(t)
            log_transforms_applied()

            if needs_reformat:
                tmp_c_filename = reformat(old_lines, tmp_c_filename)
        except Exception as e:
            print(f'Error applying {t.__name__}: {e}. Stack trace written to errors.log.')
            with open('errors.log', 'a') as f:
                print(f'***Exception {project} {c_filename} {t.__name__} ({datetime.datetime.now()})***', e, file=f)
                print(traceback.format_exc(), file=f)
    return Path(tmp_c_filename)

if __name__ == '__main__':
    # Smoke test: run every refactoring once on the testbed program, always
    # picking the first candidate so the result is deterministic.
    c_file = Path('tests/testbed/testbed.c')
    new_filename = c2c(
        project=c_file.parent,
        c_filename=c_file,
        transforms=[
            transformations.insert_noop,
            transformations.switch_exchange,
            transformations.loop_exchange,
            transformations.rename_variable,
            transformations.permute_stmt,
        ],
        picker=lambda a: a[0],
        num_iterations=5,
    )
    print(new_filename)

Applied insert_noop
Applied switch_exchange
Applied loop_exchange
Applied rename_variable
Applied permute_stmt
c2c/testbed/testbed.c


## Begin experiment

Subject benchmarks:
- ABM
- C Test Suite

Hyperparameters:
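- random seed: reset to 0 before each benchmark version is processed
- num_iterations: intended number of transformations to apply (currently not used by `c2c`, which attempts each transform once)
- transforms: the refactorings to attempt on each program, in list order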


In [14]:
from contextlib import redirect_stdout
from tqdm import tqdm_notebook
from collections import defaultdict
importlib.reload(transformations)

# Experiment parameters
def picker(collection):
    """Return a randomly chosen element of ``collection``, which must not be empty."""
    size = len(collection)
    assert size > 0, 'Collection is empty'
    return random.choice(collection)
# Passed through to c2c, which does not currently read it.
num_iterations = None
# Refactorings to attempt on every benchmark version. c2c tries them in list
# order, so reordering this list changes the generated corpus.
transforms = [
    transformations.loop_exchange,
    transformations.switch_exchange,
    transformations.permute_stmt,
    transformations.rename_variable,
    transformations.insert_noop,
]
def run_exp(project, c_file):
    """Refactor ``c_file`` of ``project`` using the notebook-level experiment parameters.

    Returns whatever ``c2c`` returns for the file.
    """
    return c2c(
        project,
        Path(c_file),
        transforms=transforms,
        picker=picker,
        num_iterations=num_iterations,
    )

# CSV listing the buggy versions from the synthetic benchmarks: one row per
# sample, with a "project" column and a "version" column (both read as strings).
df = pd.read_csv('logs/synthetic-samples.csv', dtype=str)
# df = pd.read_csv('failed-crlf-real.csv', dtype=str)
samples = list(zip(df["project"], df["version"]))
tests = Path('tests')
# Directory of every sample: tests/<project>/<version>.
all_projects = [tests / p / v for p,v in samples]
# The same directories, grouped by benchmark so each gets its own progress bar.
samples_by_project = defaultdict(list)
for p, v in samples:
    samples_by_project[p].append(tests / p / v)

# Start each run with fresh logs: log.txt receives the redirected stdout of the
# experiment loop, and errors.log is appended to by c2c when a transform raises.
logfile = Path('log.txt')
if logfile.exists():
    logfile.unlink()
if Path('errors.log').exists():
    Path('errors.log').unlink()

def store_file(old, new):
    """Save a refactored file next to its original and record their diff.

    Args:
        old: Path of the original C file.
        new: Path of the refactored copy of that file.

    Side effects:
        Writes ``<old>.reformat`` (a copy of ``new``) and ``<old>.diff`` (a
        unified diff from the original to the refactored copy; both sides are
        labelled with the original's file name).
    """
    old = Path(old)
    stored = Path(str(old) + '.reformat')
    shutil.copy(new, stored)
    # Read through context managers so the file handles are closed promptly.
    with old.open() as f:
        old_lines = f.readlines()
    with stored.open() as f:
        new_lines = f.readlines()
    diff = difflib.unified_diff(old_lines, new_lines, fromfile=old.name, tofile=old.name)
    with open(str(old) + '.diff', 'w') as f:
        f.writelines(diff)

def get_file(project):
    """Pick the C source file of a benchmark version that should be refactored.

    Files ending in ``.formatted.c`` or ``.new.c`` are ignored. If exactly one
    C file remains it is returned. Otherwise the file containing a ``/* BAD */``
    or ``/* FLAW */`` marker comment is returned; when several files carry a
    marker, the last one in glob order wins.

    Args:
        project: Path of the directory to search (non-recursively).

    Returns:
        Path of the selected C file.

    Raises:
        AssertionError: If the directory has no candidate C file, or has
            several and none of them carries a marker comment.
    """
    candidates = [
        c for c in project.glob('*.c')
        if not c.name.endswith(('.formatted.c', '.new.c'))
    ]
    if not candidates:
        raise AssertionError(f'No C files found in {project}')
    if len(candidates) == 1:
        return candidates[0]

    flagged = []
    for fpath in candidates:
        with fpath.open() as f:
            text = f.read()
        if '/* BAD */' in text or '/* FLAW */' in text:
            flagged.append(fpath)
    if not flagged:
        # Previously this fell through to an UnboundLocalError.
        raise AssertionError(
            f'Multiple C files found in {project} but none contains /* BAD */ or /* FLAW */'
        )
    return flagged[-1]

# import pandas as pd
# failed_df = pd.read_csv('logs/failed.csv', dtype=str)

# Refactor every sampled benchmark version, one progress bar per benchmark.
# for project in [p for p in all_projects if '575' in str(p)]:
for s, projects in samples_by_project.items():
    # projects = [p for p in projects if s == 'ctestsuite' and p.name == '111']
    # projects = [p for p in projects if any(s == row["project"] and p.name == row["version"] for i, row in failed_df.iterrows())]
    for project in tqdm_notebook(projects, desc=s):
        assert project.exists()
        # Re-seed per version so each result is reproducible independently of
        # which other versions were processed before it.
        random.seed(0)
        c_file = get_file(project)
        # Send everything printed while refactoring to log.txt instead of the
        # notebook output.
        with open(logfile, 'a') as f:
            with redirect_stdout(f):
                print(project)
                tmp_c_file = run_exp(project, c_file)
        # Destination of the refactored copy, next to the original source file.
        new_c_file = Path(str(c_file) + '.reformat')
        store_file(c_file, tmp_c_file)

Loaded (['insert_noop', 'loop_exchange', 'permute_stmt', 'rename_variable', 'switch_exchange'])


HBox(children=(IntProgress(value=0, description='abm', max=25, style=ProgressStyle(description_width='initial'…






HBox(children=(IntProgress(value=0, description='ctestsuite', max=87, style=ProgressStyle(description_width='i…






In [18]:
import pandas as pd
import difflib

# Column order of the per-transform TRUE/FALSE flags in logs/stats.csv.
all_transform_names = sorted([t.__name__ for t in transforms])

# Write one row per sample: which transforms were applied and how many lines
# differ between the original and the refactored file.
with open('logs/stats.csv', 'w') as csv_f:
    print(','.join(('project', 'version', *all_transform_names, 'lines changed')), file=csv_f)

    tests = Path('tests')
    df = pd.read_csv('logs/synthetic-samples.csv', dtype=str)
    for i, row in df.iterrows():
        # Get files and check they exist
        try:
            home = tests / row["project"] / str(row["version"])
            transforms_file = next(home.glob('*.transforms.txt'))
            # NOTE: old_file is the refactored copy (<name>.c.reformat) and
            # new_file is the source file it was derived from (<name>.c).
            old_file = next(transforms_file.parent.glob('*.c.reformat'))
            new_file = old_file.parent / old_file.stem
            assert transforms_file.exists()
            assert old_file.exists()
            assert new_file.exists()
        except Exception:
            # Report which sample is incomplete, then stop.
            print('Error', row["project"], row["version"])
            raise

        # Collect which transforms were applied
        with open(transforms_file) as f:
            transforms_applied = set(f.read().splitlines())
        was_applied = [t in transforms_applied for t in all_transform_names]

        # Collect number of changed lines (added plus removed lines of the
        # line-by-line diff); the files are closed as soon as they are read.
        with old_file.open() as f_old, new_file.open() as f_new:
            differences = sum(1 for d in difflib.ndiff(f_old.readlines(), f_new.readlines()) if d[0] in ('+', '-'))

        print(','.join((row["project"], str(row["version"]), *('TRUE' if a else 'FALSE' for a in was_applied), str(differences))), file=csv_f)

In [19]:
import subprocess
import pandas as pd
from pathlib import Path
import tqdm
import shutil

# Build check: for every sample, build the project as it is on disk, then swap
# in the refactored source and build again.
df = pd.read_csv('logs/synthetic-samples.csv', dtype=str)
# df = pd.read_csv('failed.csv', dtype=str)
tests = Path('tests')
# (project, version, stderr) of samples whose first build fails.
exclude = []
# (project, version, stderr) of samples that build at first but not after the
# refactored source is swapped in.
failed = []
for i, row in tqdm.tqdm_notebook(list(df.iterrows())):
    project, version = row["project"], row["version"]
    code_dir = tests / project / version
    assert code_dir.exists()

    # <name>.c.reformat is the refactored copy, <name>.c the source file used
    # by the build, and <name>.c.back a backup that must already exist (it is
    # not created in this cell).
    refactored = next(code_dir.glob('*.c.reformat'))
    original = refactored.parent / refactored.stem
    backup = refactored.parent / (refactored.stem + '.back')
    assert refactored.exists(), refactored
    assert original.exists(), original
    assert backup.exists(), backup

    # Check return code
    # First build with the source file as it currently is; samples that do not
    # build here are excluded rather than counted as refactoring failures.
    proc = subprocess.run('make clean', cwd=str(code_dir), shell=True)
    proc = subprocess.run('make', cwd=str(code_dir), capture_output=True, shell=True)
    if proc.returncode != 0:
        exclude.append((project, version, proc.stderr.decode()))
        continue
    
    try:
        # Overwrite the source with the refactored copy and rebuild from clean.
        shutil.copy(refactored, original)
        original.touch()
        proc = subprocess.run('make clean', cwd=str(code_dir), shell=True)
        proc = subprocess.run('make', cwd=str(code_dir), capture_output=True, shell=True)
        if proc.returncode != 0:
            failed.append((project, version, proc.stderr.decode()))
    finally:
        # Always put the backup back, even if the build step raised.
        shutil.copy(backup, original)

# install mysql-devel and pam-devel first.
# Some projects in ctestsuite won't build.
print(len(exclude), 'versions didn\'t build')
# exclude.csv lists the samples; exclude.txt holds the captured build errors.
with open('logs/exclude.csv', 'w') as f:
    f.write(f'project,version\n')
    for project, version, e in exclude:
        f.write(f'{project},{version}\n')
with open('logs/exclude.txt', 'w') as f:
    for project, version, e in exclude:
        f.write(f'***tests/{project}/{version}***\n')
        f.write(f'{e}\n')

print(len(failed), 'versions failed after refactoring')
# failed.csv lists the samples; failed.log holds the captured build errors.
with open('logs/failed.csv', 'w') as f:
    f.write(f'project,version\n')
    for project, version, e in failed:
        f.write(f'{project},{version}\n')
with open('logs/failed.log', 'w') as f:
    for project, version, e in failed:
        f.write(f'***tests/{project}/{version}***\n')
        f.write(f'{e}\n')

HBox(children=(IntProgress(value=0, max=112), HTML(value='')))




4 versions didn't build
1 versions failed after refactoring


In [None]:
# Save CRLF files
from pathlib import Path
# Rewrite each path listed in failed-crlf.txt as tests/<project>/<version>/<file>
# (one per line in failed-crlf-real.txt) and list the matching project,version
# pairs in failed-crlf-real.csv.
with open('failed-crlf.txt') as f:
    lines = f.readlines()
with open('failed-crlf-real.csv', 'w') as csv_f, open('failed-crlf-real.txt', 'w') as f:
    csv_f.write('project,version\n')
    for l in lines:
        parts = l.split('/')
        # The second path component is the version, the last one the file name.
        version, filename = parts[1], parts[-1].strip()
        # Versions numbered 550 and above are filed under abm, all others
        # under ctestsuite.
        project = 'abm' if int(version) >= 550 else 'ctestsuite'
        c_file = Path('tests', project, version, filename)
        f.write(f'{c_file}\n')
        csv_f.write(f'{project},{version}\n')

## Test the code
This is an example of how to use c2clib.

In [None]:
# Apply each refactoring on its own to the testbed program and print the
# resulting diff against the untouched original.
project = Path('tests/testbed2')
c_file = project / 'testbed2.c'
# The refactorings live in the `transformations` module imported at the top of
# the notebook; the bare names are not defined in this notebook.
transforms = [
    transformations.insert_noop,
    transformations.switch_exchange,
    transformations.loop_exchange,
    transformations.rename_variable,
    transformations.permute_stmt,
]
# c2c only modifies its scratch copy, so the original can be read once.
with c_file.open() as f:
    original_lines = f.readlines()
for t in transforms:
    # Keyword arguments keep picker and num_iterations from being swapped, and
    # the project is passed as a Path because c2c reads `project.name`.
    new_c_file = c2c(project, c_file, [t], picker=random.choice, num_iterations=1)
    with new_c_file.open() as f:
        diff = list(difflib.unified_diff(original_lines, f.readlines()))
    # print(t.__name__)
    print(''.join(diff))

Applied insert_noop
--- 
+++ 
@@ -94,8 +94,8 @@
 
     int s = switchtest(argv[2][0]);
     int refinisher = 123;
-int l = looptest();
+    int l = looptest();
 
-	return s + l;
+    return s + l;
 }
 

Applied switch_exchange
--- 
+++ 
@@ -35,36 +35,22 @@
     char *x;
     int y = 1;
     int z = 0;
-    
-    if (a
- =='z'
-||a
- =='a'
-||a
- =='b'
-)
-    {
+
+    if (a == 'z' || a == 'a' || a == 'b') {
         y = 10;
         if (y == 10 && y > 4 && x == 5) {
             x = "5";
             break;
         }
         y = 3;
-}
-    else if (a
- =='c'
-)
-    {
+    } else if (a == 'c') {
         y --;
         z = 3;
         z += 4;
-}
-    else 
-    {
+    } else {
         x = "1";
         y ++;
         z = 55;
-        
     }
     return strlen(x) * y + z;
 }

Applied loop_exchange
--- 
+++ 
@@ -65,11 +65,10 @@
 {
     int x = 0;
     int i = 0;
-while (i < 10
-){
+    while (i < 10) {
         x += 1;
-    i ++;
-}
+        i++;
+    }
     return x;
 }
 

Applied r