In [2]:
import re


def remove_lines_with_pattern(input_file, output_file, pattern,
                              header="eip,vpn,reuse,prediction\n"):
    r"""Copy the lines of ``input_file`` that match ``pattern`` into ``output_file``.

    Despite the name, matching lines are *kept* and every other line is
    dropped. ``header`` is written first, followed by the kept lines,
    unchanged and in their original order.

    Args:
        input_file: Path of the text file to filter.
        output_file: Path of the file to create or overwrite.
        pattern: Regular expression (string or compiled pattern). It is applied
            with ``match`` semantics: anchored at the start of each line only,
            so any text after the matched prefix is not validated.
        header: Text written before the kept lines. Defaults to the
            four-column header; pass ``"eip,vpn,reuse\n"`` for three-column
            input, or ``""`` to write no header at all.
    """
    # Compile once instead of going through the re module cache for every line.
    matches = re.compile(pattern).match

    # Reading is finished before the output is opened, so input_file and
    # output_file may be the same path. Only matching lines are held in memory.
    with open(input_file, 'r') as source:
        kept_lines = [line for line in source if matches(line)]

    with open(output_file, 'w') as sink:
        sink.write(header)
        sink.writelines(kept_lines)



# A line is kept when it starts with three comma-terminated digit runs followed
# by an optional fourth digit run. re.match only anchors at the start of the
# line, so trailing text after that prefix is not checked.
pattern = r"([0-9]+,){3}[0-9]*"

# NOTE(review): the recorded shapes of off.csv / on.csv show 3 columns, so they
# were presumably produced with a different header/pattern than the one active
# here -- confirm before re-enabling the two calls below.
# remove_lines_with_pattern("pref-off.txt", "off.csv", pattern)
# remove_lines_with_pattern("pref-on.out", "on.csv", pattern)
# Only the bug-fix trace is (re)generated by this cell.
remove_lines_with_pattern("bug_fix_pref_on.txt", "bf_on.csv", pattern)


In [3]:
import pandas as pd

# Load the filtered traces. Only bf_on.csv is written by the active call in the
# filtering cell; off.csv and on.csv must already exist on disk from an earlier
# run, otherwise these reads fail.
off_df = pd.read_csv("off.csv")
on_df = pd.read_csv("on.csv")
bf_on_df = pd.read_csv("bf_on.csv")


In [4]:
# Report the (rows, columns) shape of every loaded frame.
for caption, frame in (
    ("Off size:", off_df),
    ("On size:", on_df),
    ("Bug Fix On size:", bf_on_df),
):
    print(caption, frame.shape)

Off size: (3603382, 3)
On size: (3660403, 3)
Bug Fix On size: (3661300, 4)


In [7]:
def reuse_distribution_values(df):
    """Count the rows of ``df`` whose 'reuse' value falls in each bucket.

    Buckets, in output order: ``== 0``, ``(0, 5]``, ``(5, 10]``, ``(10, 20]``
    and ``> 20``. Values that fit none of them (negative or missing) are not
    counted anywhere.

    Args:
        df: DataFrame with a 'reuse' column.

    Returns:
        List of five Python ints, one count per bucket.
    """
    values = df['reuse']

    # (exclusive lower bound, inclusive upper bound) of the bounded buckets.
    bounded_buckets = ((0, 5), (5, 10), (10, 20))

    masks = [values == 0]
    masks.extend((values > low) & (values <= high) for low, high in bounded_buckets)
    masks.append(values > 20)

    # A boolean mask sums to the number of selected rows; int() keeps the
    # result a plain Python int.
    return [int(mask.sum()) for mask in masks]


# Bucket counts per frame, ordered as [0, 1-5, 6-10, 11-20, >20].
pref_off, pref_on, bf_on_df_dist = (
    reuse_distribution_values(frame) for frame in (off_df, on_df, bf_on_df)
)

for caption, counts in (
    ("Off:", pref_off),
    ("On:", pref_on),
    ("Bug fix On:", bf_on_df_dist),
):
    print(caption, counts)

Off: [3302718, 223082, 49644, 14407, 13531]
On: [3339414, 242589, 51552, 14268, 12580]
Bug fix On: [3340300, 243568, 51395, 14070, 11967]


In [18]:
def reuse_distribution(df):
    """Return the fraction of rows of ``df`` that fall in each reuse bucket.

    Buckets on the 'reuse' column, in output order: ``== 0``, ``(0, 5]``,
    ``(5, 10]``, ``(10, 20]`` and ``> 20``.

    Args:
        df: DataFrame with a 'reuse' column.

    Returns:
        5-tuple of floats in [0, 1] (fractions, not percentages). The
        denominator is the total number of rows, so rows whose value fits no
        bucket (negative or missing) make the fractions sum to less than 1.
        An empty frame yields all zeros instead of raising ZeroDivisionError.
    """
    reuse = df['reuse']
    total = df.shape[0]

    # No rows: every share is zero by definition; avoid dividing by zero.
    if total == 0:
        return (0.0,) * 5

    bucket_masks = (
        reuse == 0,
        (reuse > 0) & (reuse <= 5),
        (reuse > 5) & (reuse <= 10),
        (reuse > 10) & (reuse <= 20),
        reuse > 20,
    )
    # A boolean mask sums to the number of rows it selects.
    return tuple(int(mask.sum()) / total for mask in bucket_masks)


# Compute the per-bucket fractions for the off and on frames, then print each
# bucket rounded to two decimals.
off_dist, on_dist = (reuse_distribution(frame) for frame in (off_df, on_df))

for report, fractions in (
    ("Off distribution: 0: {:.2f}, 1-5: {:.2f}, 6-10: {:.2f}, 11-20: {:.2f}, >20: {:.2f}", off_dist),
    ("On distribution: 0: {:.2f}, 1-5: {:.2f}, 6-10: {:.2f}, 11-20: {:.2f}, >20: {:.2f}", on_dist),
):
    print(report.format(*fractions))

Off distribution: 0: 0.92, 1-5: 0.06, 6-10: 0.01, 11-20: 0.00, >20: 0.00
On distribution: 0: 0.91, 1-5: 0.07, 6-10: 0.01, 11-20: 0.00, >20: 0.00


In [None]:
# Plot one grouped (side-by-side) bar chart comparing the reuse distribution of
# the off and on runs: one pair of bars per reuse bucket.
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

labels = ['0', '1-5', '6-10', '11-20', '>20']
x = range(len(labels))
width = 0.35

fig, ax = plt.subplots()
# The "On" bars are shifted right by one bar width so each pair sits side by side.
rects1 = ax.bar(x, off_dist, width, label='Off')
rects2 = ax.bar([i + width for i in x], on_dist, width, label='On')

ax.set_xlabel('Reuse')
ax.set_ylabel('Percentage')
# off_dist / on_dist hold fractions in [0, 1]; show the ticks as percentages so
# the axis matches its label.
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))
ax.set_title('Reuse distribution')
# Centre each tick label between the two bars of its pair.
ax.set_xticks([i + width/2 for i in x])
ax.set_xticklabels(labels)

ax.legend()
plt.show()


In [22]:
# Report the largest and smallest reuse value seen in the off and on frames.
for max_caption, min_caption, frame in (
    ("Max reuse off:", "Min reuse off:", off_df),
    ("Max reuse on:", "Min reuse on:", on_df),
):
    reuse_column = frame['reuse']
    print(max_caption, reuse_column.max())
    print(min_caption, reuse_column.min())


Max reuse off: 1032
Min reuse off: 0
Max reuse on: 1009
Min reuse on: 0
