# Generating refactored corpus
Modify the synthetic benchmarks to generate a corpus of refactored programs.
Generate 1 refactored program for each benchmark version.

In [6]:
from importnb import Notebook
import importlib
import transformations
importlib.reload(transformations)

<module 'transformations' from '/mnt/Hydrogen_Volume/benjis/work/transform/transformations.py'>

In [7]:
from pathlib import Path
import difflib
import random
from lxml import etree as et
import pandas as pd
import shutil
import subprocess
import random
import copy
import traceback
import datetime

In [8]:
def reformat(old_lines, new_file):
    import difflib
    differ = difflib.Differ()
    with open(new_file) as f:
        new_lines = f.readlines()
    diffs = differ.compare(old_lines, new_lines)
    r = []
    line_nums = []
    lineno = 0
    for line in diffs:
        if line[0] in (' ', '+'):
            lineno += 1
        if line[0] == '+':
            if len(r) == 0 or r[-1] == lineno-1:
                r.append(lineno)
            else:
                line_nums.append(r)
                r = [lineno]
    line_nums.append(r)
    if len(line_nums) == 0:
        return old_lines
    lines = []
    for r in line_nums:
        if len(r) == 0:
            break
        if len(r) == 1:
            r = [r[0], r[0]]
        lines.append(f'--lines={r[0]}:{r[-1]}')
    style='-style="{BasedOnStyle: llvm, IndentWidth: 4}"'
    proc = subprocess.run(f'clang-format {" ".join(lines)} {new_file} -i {style}', shell=True, capture_output=True)
    if proc.returncode != 0:
        print(proc.stderr.decode())
    return new_file
    
def c2c(project, c_filename, transforms, picker, num_iterations):
    """Do C source-to-source translation"""
    transforms = copy.deepcopy(transforms)
    info = {"project": str(project)}
    transform_filename = c_filename.parent / (c_filename.stem + '.transforms.txt')
    tmp_dir = Path('c2c')
    tmp_dir.mkdir(exist_ok=True)
    tmp_project = tmp_dir / project.name
    if tmp_project.exists():
        shutil.rmtree(tmp_project)
    shutil.copytree(project, tmp_project)
    tmp_c_filename = tmp_project / c_filename.relative_to(project)
    shutil.copy(c_filename, tmp_c_filename)

    transforms_applied = []
    def log_transforms_applied():
        """Log after each transform, for debugging in case the procedure is errored."""
        with open(transform_filename, 'w') as f:
            for t in transforms_applied:
                f.write(f'{t.__name__}\n')

    # Apply num_iterations transforms, chosen randomly
    # i = 0
    # while i < num_iterations:
    for i in range(len(transforms)):
        if len(transforms) == 0:
            print('Quitting early, ran out of transforms')
            break
        # t = random.choice(transforms)
        t = transforms[0]
        try:
            if t == switch_exchange or t == loop_exchange:
                with open(tmp_c_filename) as f:
                    old_lines = f.readlines()

            new_lines = t(tmp_c_filename, picker=picker, info=info)
            if new_lines == None:
                print(f'Could not apply {t.__name__}.')
                transforms.remove(t)
                continue
            with open(tmp_c_filename, 'w') as f:
                f.writelines(new_lines)
            i += 1
            print('Applied', t.__name__)
            transforms.remove(t)
            transforms_applied.append(t)
            log_transforms_applied()

            if t == switch_exchange or t == loop_exchange:
                tmp_c_filename = reformat(old_lines, tmp_c_filename)
        except Exception as e:
            print(f'Error applying {t.__name__}: {e}. Stack trace written to errors.log.')
            with open('errors.log', 'a') as f:
                print(f'***Exception {project} {c_filename} {t.__name__} ({datetime.datetime.now()})***', e, file=f)
                print(traceback.format_exc(), file=f)
            transforms.remove(t)
    return Path(tmp_c_filename)

if __name__ == '__main__':
    c_file = Path('tests/testbed/testbed.c')
    new_filename = c2c(c_file.parent, c_file, [
        transformations.insert_noop,
        transformations.switch_exchange,
        transformations.loop_exchange,
        transformations.rename_variable,
        transformations.permute_stmt,
    ], lambda a: a[0], 5)
    print(new_filename)

Applied insert_noop
Applied switch_exchange
Applied loop_exchange
Applied rename_variable
Applied permute_stmt
c2c/testbed/testbed.c


## Begin experiment

Subject benchmarks:
- ABM
- C Test Suite

Hyperparameters:
- random seed
- num_iterations: Number of transformations to do
- transforms:


In [9]:
random.seed(0)
importlib.reload(transformations)

# Experiment parameters
def picker(collection):
    assert len(collection) > 0, 'Collection is empty'
    return random.choice(collection)
num_iterations = None
transforms = [
    transformations.insert_noop,
    transformations.switch_exchange,
    transformations.loop_exchange,
    transformations.rename_variable,
    transformations.permute_stmt,
]
def run_exp(project, c_file):
    c_file = Path(c_file)
    return c2c(project, c_file, transforms=transforms, num_iterations=num_iterations, picker=picker)

# This file contains all the buggy versions from the synthetic benchmarks with the format "project-version".
# One sample per line.
df = pd.read_csv('synthetic-samples.csv', dtype=str)
samples = list(zip(df["project"], df["version"]))
tests = Path('tests')
all_projects = [tests / p / v for p,v in samples]

for project in all_projects:
# for project in all_projects:
    print(project)
    assert project.exists()
    c_files = list(project.glob('*.c'))
    c_files = [c for c in c_files if not c.name.endswith('.formatted.c') and not c.name.endswith('.new.c')]
    assert len(c_files) >= 1, f'No C files found in {project}'

    if len(c_files) == 1:
        c_file = c_files[0]
    elif len(c_files) > 1:
        for fpath in c_files:
            with fpath.open() as f:
                text = f.read()
            if '/* BAD */' in text or '/* FLAW */' in text:
                c_file = fpath
    tmp_c_file = run_exp(project, c_file)
    new_c_file = Path(str(c_file) + '.reformat')
    shutil.copy(tmp_c_file, new_c_file)
    diff = list(difflib.unified_diff(c_file.open().readlines(), new_c_file.open().readlines(), fromfile=c_file.name, tofile=c_file.name))
    with open(str(c_file) + '.diff', 'w') as f:
        f.write(''.join(diff))

tests/abm/550
Applied insert_noop
Could not apply switch_exchange.
Could not apply loop_exchange.
Applied rename_variable
Could not apply permute_stmt.
tests/abm/557
Applied insert_noop
Could not apply switch_exchange.
Could not apply loop_exchange.
Applied rename_variable
Could not apply permute_stmt.
tests/abm/575
Applied insert_noop
Could not apply switch_exchange.
Applied loop_exchange
Applied rename_variable
Applied permute_stmt
tests/abm/577
Applied insert_noop
Could not apply switch_exchange.
Applied loop_exchange
Applied rename_variable
Applied permute_stmt
tests/abm/592
Applied insert_noop
Could not apply switch_exchange.
Could not apply loop_exchange.
Applied rename_variable
Applied permute_stmt
tests/abm/594
Applied insert_noop
Could not apply switch_exchange.
Could not apply loop_exchange.
Error applying rename_variable: No variable reference queried. Stack trace written to errors.log.
Applied permute_stmt
tests/abm/598
Applied insert_noop
Could not apply switch_exchange.
C

In [10]:
import pandas as pd
import difflib

all_transform_names = sorted([t.__name__ for t in transforms])

print(','.join(('project', 'version', *all_transform_names, 'lines changed')))

tests = Path('tests')
df = pd.read_csv('synthetic-samples-ctestsuite.csv')
for i, row in df.iterrows():
    # Get files and check they exist
    home = tests / row["project"] / str(row["version"])
    transforms_file = next(home.glob('*.transforms.txt'))
    backup_file = next(home.glob('*.c.back'))
    c_file = backup_file.parent / (backup_file.stem + '.formatted')
    assert transforms_file.exists()
    assert backup_file.exists()
    assert c_file.exists()

    # Collect which transforms were applied
    with open(transforms_file) as f:
        transforms_applied = set(f.read().splitlines())
    was_applied = [t in transforms_applied for t in all_transform_names]

    # Collect number of changed lines
    differences = sum(1 for d in difflib.ndiff(backup_file.open().readlines(), c_file.open().readlines()) if d[0] in ('+', '-'))

    print(','.join((row["project"], str(row["version"]), *('TRUE' if a else 'FALSE' for a in was_applied), str(differences))))

project,version,insert_noop,loop_exchange,permute_stmt,rename_variable,switch_exchange,lines changed


StopIteration: 

## Test the code
This is an example for how to use c2clib

In [None]:
project = Path('tests/testbed2')
c_file = project / 'testbed2.c'
transforms = [
    insert_noop,
    switch_exchange,
    loop_exchange,
    rename_variable,
    permute_stmt,
]
for t in transforms:
    new_c_file = run('tests/testbed2', c_file, [t], 1, random.choice)
    diff = list(difflib.unified_diff(c_file.open().readlines(), new_c_file.open().readlines()))
    # print(t.__name__)
    print(''.join(diff))

Applied insert_noop
--- 
+++ 
@@ -94,8 +94,8 @@
 
     int s = switchtest(argv[2][0]);
     int refinisher = 123;
-int l = looptest();
+    int l = looptest();
 
-	return s + l;
+    return s + l;
 }
 

Applied switch_exchange
--- 
+++ 
@@ -35,36 +35,22 @@
     char *x;
     int y = 1;
     int z = 0;
-    
-    if (a
- =='z'
-||a
- =='a'
-||a
- =='b'
-)
-    {
+
+    if (a == 'z' || a == 'a' || a == 'b') {
         y = 10;
         if (y == 10 && y > 4 && x == 5) {
             x = "5";
             break;
         }
         y = 3;
-}
-    else if (a
- =='c'
-)
-    {
+    } else if (a == 'c') {
         y --;
         z = 3;
         z += 4;
-}
-    else 
-    {
+    } else {
         x = "1";
         y ++;
         z = 55;
-        
     }
     return strlen(x) * y + z;
 }

Applied loop_exchange
--- 
+++ 
@@ -65,11 +65,10 @@
 {
     int x = 0;
     int i = 0;
-while (i < 10
-){
+    while (i < 10) {
         x += 1;
-    i ++;
-}
+        i++;
+    }
     return x;
 }
 

Applied r