# Coverage tracker

The purpose of this script is to retrieve and format coverage data produced by `afl-cov`.

In [None]:
import re
import os
import glob
import pandas as pd

%run utils.ipynb

### Format the coverage data by regex

In [1]:
def get_fn_dict(m):
    '''
    Compute the function dict from the lcov `FN` records.

    Each record has the form
    `FN:<line number of function start>,<function name>`.

    m: matcher object exposing `match(regexp)` (truthy when the pattern was
       found) and `group()` (the list of captured `(line, name)` tuples).

    Returns `{function name: {'start_line': int}}`. Returns None when no
    `FN` record is found, or when the optional module-level flag
    `has_fn_dict` exists and is truthy.
    '''
    # `has_fn_dict` is not defined in this notebook; look it up defensively so
    # a missing flag means "not computed yet" instead of raising NameError.
    if globals().get('has_fn_dict', False):
        return None
    if not m.match(r'^FN:(\d+),(\w+)'):
        return None
    return {name: {'start_line': int(line)} for line, name in m.group()}

def get_da_dict(m):
    '''
    Compute the data dict from the lcov `DA` records.

    Each record has the form
    `DA:<line number>,<execution count>[,<checksum>]`; the execution count
    is computed in a cumulative fashion.

    m: matcher object exposing `match(regexp)` and `group()`.

    Returns `{line number: execution count}` with integer keys and values,
    or None (after printing a warning) when no `DA` record is found.
    '''
    if m.match(r'^DA:(\d+),(\d+)'):
        line_counts = {}
        for record in m.group():
            line_no = int(record[0])
            line_counts[line_no] = int(record[1])
        return line_counts

    print("!DA information missing...")
    return None

'''
Future todos: branch coverage information
'''
# branch coverage information is stored with one line per branch
# BRDA:<line number>,<block number>,<branch number>,<taken>

'''
Future todos: function execution counts
'''
# list of execution counts  for  each  instrumented  function, which is stored in variable `fn`
# FNDA:<execution count>,<function name>

'\nFuture todos: function execution counts\n'

### Process and compute the coverage data

In [2]:
class Matcher:
    '''
    Simple regex matcher.

    Wraps a string so that one call runs a multi-line `re.findall` and a
    second call retrieves the captured results.
    '''
    def __init__(self, string):
        self.string = string
        # Start with an empty result so group() is safe to call before match().
        self.rematch = []

    def match(self, regexp):
        '''Run `regexp` over the string; return True when anything matched.'''
        self.rematch = re.findall(regexp, self.string, re.MULTILINE)
        return bool(self.rematch)

    def group(self):
        '''Return the results of the last match() call ([] if none yet).'''
        return self.rematch
    
def get_cov_info_trace(info_trace_file):
    '''
    Compute the coverage for an info trace file.

    Reads the file with `read_file_as_str`, wraps the text in a `Matcher`
    and returns the pair `(data dict, function dict)` produced by
    `get_da_dict` and `get_fn_dict`.
    '''
    matcher = Matcher(read_file_as_str(info_trace_file))
    da_dict = get_da_dict(matcher)
    fn_dict = get_fn_dict(matcher)
    return da_dict, fn_dict

def add_da_to_cov(trace_id, m, info_cov):
    '''
    Replace the cumulative coverage of `trace_id` with its per-trace delta.

    The module-level dict `cov` maps trace ids to `{line: execution count}`.
    For trace ids after 0 the preceding trace id's counts are subtracted;
    for trace id 0 the counts in `info_cov` are subtracted instead.

    trace_id: integer id of the trace to process (must be a key of `cov`).
    m:        unused; kept so existing callers keep working.
    info_cov: `{line: execution count}` baseline used for trace id 0.

    Prints an error and leaves `cov` untouched when the preceding trace id
    is missing. The result is printed and stored back in `cov[trace_id]`.
    '''
    if trace_id != 0 and trace_id-1 not in cov:
        print("Error: trace_id", trace_id-1, "missing")
        return

    baseline = info_cov if trace_id == 0 else cov[trace_id-1]
    # A trace without DA records is stored as None; treat it as "no lines".
    baseline = baseline or {}
    current = cov[trace_id] or {}

    # Walk the union of both key sets (baseline order first) so that lines
    # appearing for the first time in this trace are not silently dropped.
    this_cov = {
        key: current.get(key, 0) - baseline.get(key, 0)
        for key in {**baseline, **current}
    }
    print(trace_id, this_cov)
    cov[trace_id] = this_cov

def get_trace_id(test_file):
    '''
    Retrieve the trace id from a test file path.

    The id is the integer found between the first and the second
    underscore of the file's base name. Raises ValueError when the base
    name has fewer than two underscores or the text between them is not
    an integer.
    '''
    base_name = os.path.basename(test_file)
    _, first_sep, remainder = base_name.partition('_')
    id_text, second_sep, _ = remainder.partition('_')
    if not (first_sep and second_sep):
        # same failure mode as str.index() on a missing separator
        raise ValueError('substring not found')
    return int(id_text)

### Core driver functions

In [None]:
def process_cov(proj_loc, out_loc):
    '''
    Process the code coverage of a project and display it as a table.

    Runs `get_test_cov(proj_loc)` to fill the module-level `cov` dict, then
    builds a DataFrame with one row per (trace id, line) pair found in it,
    using the columns ["cid", "line"], and displays it.

    proj_loc: project directory handed to `get_test_cov`.
    out_loc:  currently unused.
              TODO: store the output into cov.csv and exec_c.csv for
              coverage and execution counts ["cid", "file", "line", "fn"].
    '''
    get_test_cov(proj_loc)
    # One row per (trace id, line): a flat list of lines cannot be mapped
    # onto the two declared columns. Traces stored as None (no DA records)
    # contribute no rows.
    rows = [(cid, line) for cid in sorted(cov) for line in (cov[cid] or {})]
    pd_cov = pd.DataFrame(rows, columns=["cid", "line"])
    display(pd_cov)

def get_test_cov(proj_loc, out_dir='isort-seeds-out'):
    '''
    Get code coverage from test cases/inputs generated by AFL.

    Reads the lcov trace files under `<proj_loc>/<out_dir>/cov/lcov` and
    fills the module-level dict `cov` with `{trace id: {line: count}}`.
    The `DA` counts in the trace files are cumulative, so every trace
    except the last one is converted to a delta against its predecessor
    via `add_da_to_cov`.

    proj_loc: project directory containing the AFL output directory.
    out_dir:  name of the AFL output directory inside `proj_loc`.
    '''
    global cov
    # Start from a clean slate: `cov` is not initialised anywhere else in this
    # notebook, and results of an earlier run must not leak into this one.
    cov = {}
    lcov_loc = os.path.join(proj_loc, out_dir, 'cov', 'lcov')

    # info_cov is used to compute the coverage of the first test case,
    # because the da and fnda values are cumulative
    info_trace_file = os.path.join(lcov_loc, 'trace.lcov_info_final')
    info_cov, _fn_dict = get_cov_info_trace(info_trace_file)

    test_files = glob.glob(os.path.join(lcov_loc, '*_trace.lcov_info_final'))
    id_dict = {get_trace_id(test_file): test_file for test_file in test_files}

    # get all cov first
    m = None
    for trace_id in sorted(id_dict):
        # todo: capture each end_of_record
        # todo: skip if LH is null, which means there was no coverage hit
        m = Matcher(read_file_as_str(id_dict[trace_id]))
        cov[trace_id] = get_da_dict(m)

    # Process backward from the highest trace id so that each predecessor is
    # still cumulative when it is subtracted.
    # The last trace_id is ignored as it may be incomplete.
    for trace_id in sorted(id_dict, reverse=True)[1:]:
        add_da_to_cov(trace_id, m, info_cov)