# Coverage Monitor

Problem statement

Approach

Who are we?

## Constants

In [1]:
import os

# benchmark directory
# Defaults to the original local checkout; export FUZZ_BENCHMARK_LOC to point the
# notebook at a different benchmark root without editing this cell.
benchmark_loc = os.environ.get(
    "FUZZ_BENCHMARK_LOC",
    "/Users/IceCream/Documents/hw22_fuzzing/benchmark",
)

# studied project
proj = "libpng-1.6.37-setting7"
proj_loc = os.path.join(benchmark_loc, proj)

# output directory
out_loc = os.path.join(proj_loc, "analysis")
# exist_ok avoids the check-then-create race and makes re-running the cell a no-op
os.makedirs(out_loc, exist_ok=True)

## Launching AFL

In [None]:
# afl driver goes here

## Monitoring executions

#### 1. Collecting coverage

In [45]:
%run cov_tracker.ipynb

import time

def process_cov(proj_loc, out_loc):
    '''
    Load (or collect and cache) the coverage data and derive execution counts.

    The raw data lives under ``<out_loc>/raw``. When either cached file is
    missing, ``collect_cov(proj_loc)`` and ``preprocess_cov(...)`` are run;
    otherwise both frames are read back from the cache with ``pd.read_csv``.
    The execution counts are then sorted by ``trace_id`` in descending order,
    every row is passed through ``diff_cov`` (cumulative count -> individual
    count), and rows whose ``exec`` value is 0 are dropped.

    Side effect: rebinds the notebook-level global ``df_exec_c``.

    Parameters:
        proj_loc: project directory handed to ``collect_cov``.
        out_loc: analysis output directory; ``raw/`` is created inside it.

    Returns:
        (df_spectra, df_exec_c) as pandas DataFrames.
    '''
    # df_exec_c is deliberately module-level: it is rebound below and the lambda
    # passed to apply() reads it at call time.
    global df_exec_c

    raw_data_loc = os.path.join(out_loc, 'raw')
    # NOTE(review): this cache file is named 'spectra_csv' (no '.csv' extension);
    # it looks like a typo but is kept so existing caches stay valid.
    spectra_csv = os.path.join(raw_data_loc, 'spectra_csv')
    exec_c_csv = os.path.join(raw_data_loc, 'exec_c.csv')
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(raw_data_loc, exist_ok=True)

    if not os.path.exists(exec_c_csv) or not os.path.exists(spectra_csv):
        # collect the code coverage and store it in `df_exec_c`
        print("Collecting coverage results...")
        collect_cov(proj_loc)
        # preprocess the data before computing the individual count
        print("Preprocessing coverage...")
        df_spectra, df_exec_c = preprocess_cov(spectra_csv, exec_c_csv)
    else:
        df_spectra = pd.read_csv(spectra_csv)
        df_exec_c = pd.read_csv(exec_c_csv)

    # from cumulative count to individual count
    print("Computing execution count...")
    print("exec_c {} & spectra {}".format(df_exec_c.shape, df_spectra.shape))
    # sorted reverse, start from the last trace_id
    # sampling
    # df_exec_c = df_exec_c[df_exec_c['trace_id']>210]
    df_exec_c = df_exec_c.sort_values(by=['trace_id'], ascending=False)
    # diff_cov receives the sorted frame: the global is only rebound after apply() returns
    df_exec_c = df_exec_c.apply(lambda row: diff_cov(row, df_exec_c), axis=1)
    # keep only the statements that were actually executed by a trace
    df_exec_c = df_exec_c[df_exec_c['exec'] != 0]

    return df_spectra, df_exec_c

# set global df_exec_c to store raw coverage data
# NOTE(review): `counter` and `verbose` are not read in this cell; presumably they are
# consumed by the helpers loaded through `%run cov_tracker.ipynb` — confirm there.
counter = 0
verbose = True
df_exec_c = []

# perf_counter is monotonic, so the elapsed time cannot be skewed by system clock changes
start = time.perf_counter()
df_spectra, df_exec_c = process_cov(proj_loc, out_loc)

# output 1: the coverage spectra [line, fn, file] covered by tests
# NOTE(review): unlike the other exports this one also writes the DataFrame index
# (no index=False); left unchanged because readers of spectra.csv may depend on it.
spectra_csv = os.path.join(out_loc, 'spectra.csv')
print("spectra {}".format(df_spectra.shape))
df_spectra.to_csv(spectra_csv)

# output 2: the execution count on each statement `cid` for each trace id [trace_id, cid, exec]
exec_csv = os.path.join(out_loc, 'execution.csv')
print("exec_c {} ".format(df_exec_c.shape))
df_exec_c.to_csv(exec_csv, index=False)

# elapsed time of the whole step, in seconds
end = time.perf_counter()
print(end - start)

Computing execution count...
exec_c (534184, 3) & spectra (2405, 4)
counter at 1000
counter at 2000
counter at 3000
counter at 4000
counter at 5000
counter at 6000
counter at 7000
counter at 8000
counter at 9000
counter at 10000
counter at 11000
counter at 12000
counter at 13000
counter at 14000
counter at 15000
counter at 16000
counter at 17000
counter at 18000
counter at 19000
counter at 20000
counter at 21000
counter at 22000
counter at 23000
counter at 24000
counter at 25000
counter at 26000
counter at 27000
counter at 28000
counter at 29000
counter at 30000
counter at 31000
spectra (2405, 4)
exec_c (25569, 3) 
19.864296913146973


#### 2. Computing response time

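The response time of each trace is estimated from the creation times of the coverage files: it is the difference between the creation time of a trace's coverage file and that of the preceding trace.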

In [46]:
%run cov_tracker.ipynb

import os
import glob
import platform
import datetime

def get_response_time(proj_loc):
    """
    Estimate per-trace response times from coverage-file creation timestamps.

    Coverage files are looked up in ``<proj_loc>/seeds_out/cov/lcov``. Every
    ``*_trace.lcov_info_final`` file is keyed by ``get_trace_id``; the final
    report ``trace.lcov_info_final`` is keyed as -1 and the baseline
    ``trace.lcov_base`` as -2. The response time of trace ``i`` is the number
    of seconds between the timestamp of file ``i`` and that of file ``i - 1``.

    The highest trace id is always skipped, and an id whose predecessor has no
    recorded timestamp (a gap in the id sequence) is left out of the result
    instead of raising ``KeyError``.

    Parameters:
        proj_loc: project directory containing ``seeds_out/cov/lcov``.

    Returns:
        DataFrame with columns ``trace_id`` and ``response_time`` (seconds),
        ordered by descending trace id.

    Raises:
        OSError: if one of the files that must be stat'ed does not exist.
    """
    print("Computing the response time...")
    lcov_loc = os.path.join(proj_loc, 'seeds_out', 'cov', 'lcov')
    # get list of all coverage files
    test_files = glob.glob(os.path.join(lcov_loc, '*_trace.lcov_info_final'))
    id_dict = {get_trace_id(test_file): test_file for test_file in test_files}

    # info_final
    id_dict[-1] = os.path.join(lcov_loc, 'trace.lcov_info_final')
    # base lcov
    id_dict[-2] = os.path.join(lcov_loc, 'trace.lcov_base')

    # the platform does not change while looping, so query it once
    system = platform.system()

    # collect creation time for all other trace_id
    # 1 to skip the last one which might contain no instrumentation when the fuzzing run is manually interrupted
    c_time = {}
    for trace_id in sorted(id_dict, reverse=True)[1:]:
        cov_file = id_dict[trace_id]
        stat = os.stat(cov_file)
        if system == "Darwin":
            c_timestamp = stat.st_birthtime
        elif system == "Windows":
            # this might contain bugs, never tested
            c_timestamp = os.path.getmtime(cov_file)
        else:
            # NOTE(review): on Linux st_ctime is the last metadata change, not a true creation time
            c_timestamp = stat.st_ctime
        c_time[trace_id] = datetime.datetime.fromtimestamp(c_timestamp)

    # compute difference in creation time as the response time;
    # ids below -1 have no predecessor by construction, and ids with a missing
    # predecessor are skipped because their response time is undefined
    response_time = {
        trace_id: (created - c_time[trace_id - 1]).total_seconds()
        for trace_id, created in c_time.items()
        if trace_id >= -1 and (trace_id - 1) in c_time
    }
    df_c_time = pd.DataFrame(list(response_time.items()), columns=['trace_id', 'response_time'])

    return df_c_time

# output: response time based on the creation time of the coverage file
response_time_csv = os.path.join(out_loc, 'response_time.csv')

# compute the per-trace response times, then persist them without the index column
df_c_time = get_response_time(proj_loc)
df_c_time.to_csv(path_or_buf=response_time_csv, index=False)

Computing the response time...


#### 3. Looking for patterns

In [47]:
import numpy as np

def group_tests():
    '''
    Placeholder: intended to group the tests with similar data flow patterns.

    Not implemented yet; always returns None.
    '''
    return None

def input_sensitive(row):
    '''
    Placeholder: intended to look for statements that are input sensitive.

    Not implemented yet; `row` is ignored and None is always returned.
    '''
    return None

def get_preprocessed_data(out_loc):
    """
    Read back the three CSV files stored in `out_loc`.

    Returns a tuple of DataFrames, in this order: the contents of
    'spectra.csv', 'execution.csv' and 'response_time.csv'.
    """
    csv_names = ('spectra.csv', 'execution.csv', 'response_time.csv')
    # the files are read lazily, in the order listed above, while unpacking
    spectra, exec_counts, response_times = (
        pd.read_csv(os.path.join(out_loc, csv_name)) for csv_name in csv_names
    )
    return spectra, exec_counts, response_times

# reload the artefacts written by the previous steps
df_spectra, df_exec_c, df_response_time = get_preprocessed_data(out_loc)

# for every statement `cid`, list the distinct execution counts observed across traces
df_cid_executed = (
    df_exec_c
    .groupby('cid')['exec']
    .apply(lambda counts: list(np.unique(counts)))
    .reset_index(name='exec_c_list')
)
cid_executed_csv = os.path.join(out_loc, 'cid_executed.csv')
display(df_cid_executed)
df_cid_executed.to_csv(cid_executed_csv, index=False)
# df_exec_c['input_sensitive'] = df_exec_c.apply(lambda row: input_sensitive(row))
# likely to be changed by more tests, but less execution counts
# likely to be changed by less tests, but more execution counts

Unnamed: 0,cid,exec_c_list
0,0,"[3, 645]"
1,1,"[3, 645]"
2,2,"[3, 645]"
3,3,"[3, 645]"
4,4,"[3, 645]"
...,...,...
2400,2400,"[189, 192, 24804]"
2401,2401,"[189, 192, 24804]"
2402,2402,"[189, 192, 24804]"
2403,2403,"[189, 192, 24804]"
