# Plotting
This file contains all the code useful to data analysis.
Some cells may take a 15 minutes to process. Likely long processes will be indicated with #long.
Read comment for parts adapted to your outputs.

Below is the cell deemed necessary for most sections. Each section is to be run sequentially.

In [1]:
#imports 
import os
import csv
import statistics, re
import pandas as pd
import seaborn as sns
from statistics import mean
import matplotlib.pyplot as plt
import numpy as np
# useful functions for all data extraction
def line_style(name, exponent):
    if int(name[9:-19])==1:
        style='-'
    elif int(name[9:-19])==5:
        style='--'
    elif int(name[9:-19])==10:
        style='-.'
    elif int(name[9:-19])==50:
        style=':'
    if exponent>=10: #coloring of the curves
        color='violet'
    elif exponent>=9:
        color='royalblue'
    elif exponent>=8:
        color='cyan'
    elif exponent>=7:
        color='turquoise'
    elif exponent>=6:
        color='green'
    elif exponent>=5:
        color='yellowgreen'
    elif exponent>=4:
        color='gold'
    elif exponent>=3:
        color='orange'
    elif exponent>=2:
        color='chocolate'
    else:
        color='red'
    return style, color
#processing output from file to pandas dataframe
def process_output(output_filename, run_output_filename, model_filename):
    def add_names(row, models):
        key = str(int(row['model_id']))
        name = models[key]['name']
        if name == 'Mondrian':
            return name + ' ' + models[key]['tree_count'] + ' tree(s) (RAM x' + str(int(models[key]['memory_size']) / 600000) + ')'
        elif name == 'MCNN':
            if models[key]['cleaning'] == '1':
                return 'MCNN Origin ' + models[key]['cluster_count'] + ' clusters'
            elif models[key]['cleaning'] == '2':
                return 'MCNN Mixe ' + models[key]['cluster_count'] + ' clusters'
            else:
                return 'MCNN OrpailleCC ' + models[key]['cluster_count'] + ' clusters'
        elif name == 'StreamDM HoeffdingTree':
            return name
        else:
            return models[key]['fullname']
    def add_files(row, models):
        return models[str(int(row['model_id']))]['file']
    def add_color(row, models):
        return models[str(int(row['model_id']))]['color']
    def add_library(row, models):
        if models[str(int(row['model_id']))]['fullname'].find('StreamDM') >= 0:
            return 'StreamDM'
        return 'OrpailleCC'
    def add_algorithm(row, models):
        if models[str(int(row['model_id']))]['fullname'].find('Naive') >= 0:
            return 'NaiveBaye'
        if models[str(int(row['model_id']))]['fullname'].find('Naive') >= 0:
            return 'NaiveBaye'
        if models[str(int(row['model_id']))]['fullname'].find('Mondrian') >= 0:
            return 'Mondrian Forest'
        if models[str(int(row['model_id']))]['fullname'].find('MCNN') >= 0:
            return 'MCNN'
        if models[str(int(row['model_id']))]['fullname'].find('MLP') >= 0:
            return 'MLP'
        if models[str(int(row['model_id']))]['fullname'].find('FNN') >= 0:
            return 'FNN'
        return 'Unknown'

    models = read_models(model_filename)
    output = pd.read_csv(output_filename)
    output_runs = pd.read_csv(run_output_filename)
    output.columns = ['model_id', 'run_id', 'element_count', 'seed', 'accuracy', 'f1', 'memory']
    output_runs.columns = ['model_id', 'run_id', 'time', 'energy', 'power']
    output['fullname'] = output.apply(lambda r: add_names(r, models), axis=1)
    output['file'] = output.apply(lambda r: add_files(r, models), axis=1)
    #print("Adding to output run")
    output_runs['fullname'] = output_runs.apply(lambda r: add_names(r, models), axis=1)
    output_runs['file'] = output_runs.apply(lambda r: add_files(r, models), axis=1)
    output_runs['color'] = output_runs.apply(lambda r: add_color(r, models), axis=1)
    output_runs['library'] = output_runs.apply(lambda r: add_library(r, models), axis=1)
    output_runs['algorithm'] = output_runs.apply(lambda r: add_algorithm(r, models), axis=1)
    return (output, output_runs, models)
#recognize the model, more could be added besides Mondrian trees.
def read_models(filename):
    models = {}
    model_file = open(filename, "r")
    csv_structure = csv.reader(model_file)
    for row in csv_structure:
        color = hashStringToColor(row[1] + "".join(row[3:]))
        models[row[0]] = {"name": row[1], "file": row[2], "color": color}
        if row[1] == 'Mondrian':
            models[row[0]]["lifetime"] = row[3]
            models[row[0]]["base"] = row[4]
            models[row[0]]["discount"] = row[5]
            models[row[0]]["tree_count"] = row[6]
            models[row[0]]["memory_size"] = row[7]
            models[row[0]]["fullname"] = "Mondrian T" + row[1][row[1].find("Mondrian")+8:] + " " + row[3] + "-" + row[4] + "-" + row[5]
        elif row[1] == "FNN":
            models[row[0]]["fullname"] = "FNN"
        else:
            models[row[0]]["fullname"] = models[row[0]]["name"]
    return models
def stdev(l):
    if len(l) <= 1:
        return 0.0
    return statistics.stdev(l)
def hashStringToColor(string):
    hsh = hash(string)
    r = (hsh & 0xFF0000) >> 16
    g = (hsh & 0x00FF00) >> 8
    b = hsh & 0x0000FF
    return "#" + format(r, "02x") + format(g, "02x") + format(b, "02x")

## Data extraction for paper
The paper is available at https://www.overleaf.com/read/rtvpkqksbqxj. All data extracted is obtained using the following code.

## 1. Figure 1.  Difference between the F1 score obtained at precision i and the F1 score of the same model at double precision (52 bits) with  3.0  MB  of  memory.
Run all cells. Adapt the first lines to the number of trees you want to plot.

In [7]:
# Choosing the trees to plot.
tr = {1:'-',5:'--',10:'-.',50:':'}
number_of_trees = tr[5] # Number of trees here.

# Function
def print_dif(output, output_runs, models, axs):
    def add_key(key):
        name = models[0][key]['name']
        if name == 'Mondrian':
            ram_count = str(int(models[0][key]['memory_size']) / 600000)
            return (name + ' ' + models[0][key]['tree_count'] + ' tree(s) (RAM x' + ram_count + ')', key, (name + ' (RAM x' + ram_count + ')', int(models[0][key]['tree_count'])))
        else:
            return (name, key, (name, 0))
    knn_offline_f1 = {'banos_6': 0.86, 'recofit_6': 0.40}
    #(print name, key in models, tuple for sorting)
    keys = [add_key(key) for key in models[0] if models[0][key]['fullname'] != 'Previous']
    #grounp the third and second value to use dict to do a unique
    keys = dict([(key[0], (key[1], key[2])) for key in keys]).items()
    #Unpack the value part to separate the key and the sorting tuple
    keys = [(key[0], key[1][0], key[1][1]) for key in keys]
    keys = sorted(keys, key = lambda x: x[2])
    names = [key[0] for key in keys]
    plt.rcParams.update({'font.size': 33})
    list_datasets = ['banos_6','recofit_6'] # modifity for your dataset names, if testing with other datasets.
    for dat, dataset_name in enumerate(list_datasets):
        print('Dataset: ' + dataset_name)
        daty, daty_std = [], []
        for i, precision in enumerate(PRECISIONS):
            datytmp = output[i][output[i].file.str.contains(dataset_name)]
            daty_stdtmp = datytmp[['fullname', 'element_count', 'f1', 'accuracy', 'memory']].groupby(['fullname', 'element_count']).std().reset_index()
            datytmp = datytmp[['fullname', 'element_count', 'f1', 'accuracy', 'memory']].groupby(['fullname', 'element_count']).mean().reset_index()
            daty.append(datytmp)
            daty_std.append(daty_stdtmp)
        #F1 Score DIF
        axs[dat].plot([-100000,100000],[0.05,0.05], color='darkgrey',linewidth=2.0,linestyle='--')
        axs[dat].plot([-100000,100000],[-0.05,-0.05], color='darkgrey',linewidth=2.0,linestyle='--')
        for name in names:
            if name[-4:]=='5.0)':
                for i, precision in enumerate(PRECISIONS):
                    style, color = line_style(name, precision)
                    if style==number_of_trees: # number of trees to plot
                        if i<11:
                            y1 = daty[i][daty[i].fullname == name]['f1']-daty[-1][daty[-1].fullname == name]['f1'] - daty_std[i][daty_std[i].fullname == name]['f1']
                            y2 = daty[i][daty[i].fullname == name]['f1']-daty[-1][daty[-1].fullname == name]['f1'] + daty_std[i][daty_std[i].fullname == name]['f1']
                            axs[dat].fill_between(daty_std[i][daty_std[i].fullname == name]['element_count'], y1, y2, color=color, linestyle=style, alpha=0.05)
                        axs[dat].plot(daty[i][daty[i].fullname == name]['element_count'],daty[i][daty[i].fullname == name]['f1']-daty[-1][daty[-1].fullname == name]['f1'], markevery=0.1, markersize=15, color=color,linewidth=2.0)

In [8]:
#long
#Data extraction, putting results in lists.
dir_name = 'node_results'
exp = '11'
output_lst, output_run_lst, models_lst = [], [], []
PRECISIONS = range(1,52)
for i in PRECISIONS:
    output, output_runs, models = process_output("{}/{}_output_{}".format(dir_name,exp,i),"{}/{}_output_runs_{}".format(dir_name,exp,i),"{}/{}_models_{}.csv".format(dir_name,exp,i))
    output_lst.append(output)
    output_run_lst.append(output_runs)
    models_lst.append(models)
dir_name = 'whole_results'
output_lst2, output_run_lst2, models_lst2= [], [], []
PRECISIONS = range(1,52)
for i in PRECISIONS:
    output, output_runs, models = process_output("{}/{}_output_{}".format(dir_name,exp,i),"{}/{}_output_runs_{}".format(dir_name,exp,i),"{}/{}_models_{}.csv".format(dir_name,exp,i))
    output_lst2.append(output)
    output_run_lst2.append(output_runs)
    models_lst2.append(models)

In [9]:
# Plotting
plt.rcParams.update({'font.size': 24})
fig = plt.figure(figsize=(30, 20))
gs = fig.add_gridspec(2, 2, hspace=0.05, wspace=0.05)
(ax1, ax2), (ax3, ax4) = gs.subplots(sharex='col', sharey='row')
PRECISIONS = range(1,52)
print_dif(output_lst, output_run_lst, models_lst, [ax1,ax2])
PRECISIONS = range(1,52)
print_dif(output_lst2, output_run_lst2, models_lst2, [ax3,ax4])
ticks = np.linspace(0,1,5)
for ax in [ax1,ax2,ax3,ax4]: #uniform axes
    ax.set(xlabel='% of dataset', ylabel='Change in Performance (\u0394 F1)')
    ax.set_ylim([-0.15, 0.08])
    if ax==ax1 or ax==ax3:
        ax.set_xticks(14200*ticks) # values
        ax.set_xlim([-14400*0.03, 14400+14400*0.03])
    else:
        ax.set_xticks(84800*ticks)
        ax.set_xlim([-84800*0.03, 84800+84800*0.03])
    ax.label_outer()
plt.savefig("difplot.png") #Name of the plot.
plt.clf()

Dataset: banos_6
Dataset: recofit_6
Dataset: banos_6
Dataset: recofit_6


<Figure size 2160x1440 with 0 Axes>

## 2. Figure 2. F1 Score of the uninstrumented classifiers with 0.6 MB and 1.2 MB, NI and WI classifiers at 4 bit exponent and 3 bitprecision (8 bits) with 1.2 MB.
Run all cells.

In [10]:
#long
#Array generation
model=[['node',0,52,11],['node',1,52,11],['node',1,3,4],['whole',1,3,4]]
f1 = np.zeros((len(model), 98)) # Number of classifiers in models.csv
for i, m in enumerate(model):
    file1 = np.genfromtxt('{}_results/{}_output_{}'.format(m[0], m[-1], m[-2]), delimiter=',', dtype=np.float)
    for atts in file1:
        if (atts[2]==84800 and atts[0]>=84) or (atts[2]==14200 and atts[0]<84): #difference in datasets
            f1[i,int(atts[0])]+= atts[-2]
f1/=30 #averaging
arr = np.zeros((4,8))
for i,m in enumerate(model):
    for j, k in enumerate(range(4*m[1],4*(m[1]+1))):
        arr[i][j]=f1[i][:12][k]
        arr[i][j+4]=f1[i][84:96][k]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  file1 = np.genfromtxt('{}_results/{}_output_{}'.format(m[0], m[-1], m[-2]), delimiter=',', dtype=np.float)


In [11]:
# Plotting
plt.rcParams.update({'font.size': 24})
fig = plt.figure(figsize=(30, 12))
gs = fig.add_gridspec(1,2, hspace=0.05, wspace=0.05)
(ax1, ax2) = gs.subplots(sharey='row')
for i, dataset,ax in zip(range(2),['banos', 'recofit'],(ax1,ax2)):
    x = range(1,5)
    ax.plot(x, arr[0][4*i:4*i+4], c="red", linestyle='--', marker='o', markersize=4, label="Uninstrumented, 0.6 MB")
    ax.plot(x, arr[1][4*i:4*i+4], c="red", marker='o', markersize=4, label="Uninstrumented, 1.2 MB")
    ax.plot(x, arr[2][4*i:4*i+4], c="blue", marker='o', markersize=4, label="Node, 1.2 MB")
    ax.plot(x, arr[3][4*i:4*i+4], c="green", marker='o', markersize=4, label="Whole, 1.2 MB")
    ax.set_xticks(x) # values
    ax.set_xticklabels([1,5,10,50]) # labels
    
for ax in [ax1,ax2]:
    ax.set(xlabel='Number of trees', ylabel='F1')
    ax.set_ylim([0, 0.65])
    ax.grid()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.label_outer()
plt.savefig("gain.png")
plt.clf()

<Figure size 2160x864 with 0 Axes>

## 3. TABLE 2: F1 score differences with reduced exponent lengths
Run all cells. This is meant for Latex input, observe arr if necessary.

In [7]:
# Exponent analysis table generation, node.
precisions=[52]
exponents= [2,3,4,5,11]
f1 = np.zeros((len(precisions), len(exponents), 98)) # Number of classifiers in models.csv
for instr in ['node']:#['node', 'whole']:
    for pi, p in enumerate(precisions):
        for ei, e in enumerate(exponents):
            file1 = np.genfromtxt('{}_results/{}_output_{}'.format(instr, e, p), delimiter=',', dtype=float)
            for atts in file1:
                if (atts[2]==84800 and atts[0]>=84) or (atts[2]==14200 and atts[0]<84): #difference in datasets
                    f1[pi,ei,int(atts[0])]+= atts[-2]
f1/=30
arr = np.zeros((2,3,4,5))
for k in range(12):
    for e, b in enumerate(f1):
        for p, a in enumerate(b):
            arr[0][k//4][k%4][p]=(a[:12]-f1[-1][-1][:12])[k]*100
            arr[1][k//4][k%4][p]=(a[84:96]-f1[-1][-1][84:96])[k]*100
for i in range(2):
    for j in [0,2]:
        for k in range(4):
            print("&\cTab{{{:.2f}}}&\cTab{{{:.2f}}}&\cTab{{{:.2f}}}&\cTab{{{:.2f}}}".format(arr[i][j][k][0],arr[i][j][k][1],arr[i][j][k][2],arr[i][j][k][3]))

&\cTab{-43.21}&\cTab{-4.46}&\cTab{-0.00}&\cTab{-0.00}
&\cTab{-53.74}&\cTab{-1.57}&\cTab{0.00}&\cTab{0.00}
&\cTab{-53.44}&\cTab{-1.73}&\cTab{0.00}&\cTab{0.00}
&\cTab{-39.71}&\cTab{-0.47}&\cTab{0.00}&\cTab{0.00}
&\cTab{-46.28}&\cTab{-8.78}&\cTab{0.00}&\cTab{0.00}
&\cTab{-63.34}&\cTab{-4.04}&\cTab{0.00}&\cTab{0.00}
&\cTab{-63.96}&\cTab{-3.31}&\cTab{0.00}&\cTab{-0.00}
&\cTab{-54.82}&\cTab{-1.55}&\cTab{-0.00}&\cTab{0.00}
&\cTab{-17.63}&\cTab{-12.91}&\cTab{-0.31}&\cTab{0.00}
&\cTab{-14.91}&\cTab{-10.70}&\cTab{-0.09}&\cTab{-0.00}
&\cTab{-9.96}&\cTab{-7.00}&\cTab{-0.00}&\cTab{-0.00}
&\cTab{-4.82}&\cTab{-2.25}&\cTab{0.00}&\cTab{0.00}
&\cTab{-21.27}&\cTab{-16.01}&\cTab{-0.42}&\cTab{-0.00}
&\cTab{-22.14}&\cTab{-17.26}&\cTab{-0.18}&\cTab{0.00}
&\cTab{-19.66}&\cTab{-15.21}&\cTab{-0.08}&\cTab{0.00}
&\cTab{-10.03}&\cTab{-7.17}&\cTab{-0.01}&\cTab{-0.00}


In [8]:
# Exponent analysis table generation,whole.
precisions=[52]
exponents= [3,4,5,11]
f1 = np.zeros((len(precisions), len(exponents), 98)) # Number of classifiers in models.csv
for instr in ['whole']:#['node', 'whole']:
    for pi, p in enumerate(precisions):
        for ei, e in enumerate(exponents):
            file1 = np.genfromtxt('{}_results/whole_results/{}_output_{}'.format(instr, e, p), delimiter=',', dtype=float)
            for atts in file1:
                if (atts[2]==84800 and atts[0]>=84) or (atts[2]==14200 and atts[0]<84): #difference in datasets
                    f1[pi,ei,int(atts[0])]+= atts[-2]
f1/=30
arr = np.zeros((2,3,4,4))
for k in range(12):
    for e, b in enumerate(f1):
        for p, a in enumerate(b):
            arr[0][k//4][k%4][p]=(a[:12]-f1[-1][-1][:12])[k]
            arr[1][k//4][k%4][p]=(a[84:96]-f1[-1][-1][84:96])[k]
for i in range(2):
    for j in range(3):
        for k in range(4):
            print("&\cTab{{{:.4f}}}&\cTab{{{:.4f}}}&\cTab{{{:.4f}}}".format(arr[i][j][k][0],arr[i][j][k][1],arr[i][j][k][2]))

&\cTab{-0.4628}&\cTab{-0.0006}&\cTab{-0.0000}
&\cTab{-0.5590}&\cTab{-0.0137}&\cTab{0.0000}
&\cTab{-0.5404}&\cTab{-0.0309}&\cTab{-0.0000}
&\cTab{-0.3984}&\cTab{-0.1729}&\cTab{0.0000}
&\cTab{-0.4840}&\cTab{-0.0002}&\cTab{0.0000}
&\cTab{-0.6130}&\cTab{-0.0009}&\cTab{0.0000}
&\cTab{-0.5937}&\cTab{-0.0078}&\cTab{-0.0000}
&\cTab{-0.4798}&\cTab{-0.1351}&\cTab{-0.0000}
&\cTab{-0.4923}&\cTab{-0.0001}&\cTab{-0.0000}
&\cTab{-0.6571}&\cTab{-0.0031}&\cTab{0.0000}
&\cTab{-0.6586}&\cTab{-0.0091}&\cTab{0.0000}
&\cTab{-0.5590}&\cTab{-0.1388}&\cTab{-0.0000}
&\cTab{-0.1856}&\cTab{-0.0121}&\cTab{-0.0000}
&\cTab{-0.1552}&\cTab{-0.0389}&\cTab{0.0000}
&\cTab{-0.1069}&\cTab{-0.0260}&\cTab{-0.0000}
&\cTab{-0.0548}&\cTab{-0.0260}&\cTab{0.0000}
&\cTab{-0.2066}&\cTab{0.0088}&\cTab{-0.0000}
&\cTab{-0.1874}&\cTab{-0.0017}&\cTab{0.0000}
&\cTab{-0.1577}&\cTab{-0.0235}&\cTab{-0.0000}
&\cTab{-0.0753}&\cTab{-0.0220}&\cTab{0.0000}
&\cTab{-0.2237}&\cTab{0.0159}&\cTab{0.0000}
&\cTab{-0.2321}&\cTab{0.0360}&\cTab{-0.0000}
&\

## 4. TABLE 3: F1  score  differences  with  reduced  mantissa  precision
Modify the instrumentation as necessary, between 'node' and 'whole'. This is meant for Latex input, observe arr if necessary.

In [23]:
# Precision analysis table generation.
instrumentation='whole'
import numpy as np
precisions=[1,2,3,4,5,6,52]
exponents= [11]
f1 = np.zeros((len(precisions), len(exponents), 98)) # Number of classifiers in models.csv
for instr in [instrumentation]:#['node', 'whole']:
    for pi, p in enumerate(precisions):
        for ei, e in enumerate(exponents):
            file1 = np.genfromtxt('{}_results/{}_output_{}'.format(instr, e, p), delimiter=',', dtype=float)
            for atts in file1:
                if (atts[2]==84800 and atts[0]>=84) or (atts[2]==14200 and atts[0]<84): #difference in datasets
                    f1[pi,ei,int(atts[0])]+= atts[-2]
f1/=30
arr = np.zeros((2,3,4,7))
for k in range(12):
    for p, b in enumerate(f1):
        for e, a in enumerate(b):
            arr[0][k//4][k%4][p]=(a[:12]-f1[-1][-1][:12])[k]
            arr[1][k//4][k%4][p]=(a[84:96]-f1[-1][-1][84:96])[k]
for i in range(2):
    for j in range(3):
        for k in range(4):
            print("&\dTab{{{:.4f}}}&\dTab{{{:.4f}}}&\dTab{{{:.4f}}}&\dTab{{{:.4f}}}&\dTab{{{:.4f}}}&\dTab{{{:.4f}}}".format(arr[i][j][k][0],arr[i][j][k][1],arr[i][j][k][2],arr[i][j][k][3],arr[i][j][k][4],arr[i][j][k][5],arr[i][j][k][6]))

&\dTab{-0.0755}&\dTab{-0.0570}&\dTab{-0.0330}&\dTab{-0.0239}&\dTab{-0.0072}&\dTab{-0.0064}
&\dTab{-0.0935}&\dTab{-0.0603}&\dTab{-0.0272}&\dTab{-0.0124}&\dTab{-0.0055}&\dTab{-0.0059}
&\dTab{-0.1150}&\dTab{-0.0510}&\dTab{-0.0172}&\dTab{-0.0090}&\dTab{-0.0038}&\dTab{-0.0041}
&\dTab{-0.2147}&\dTab{-0.1505}&\dTab{-0.0773}&\dTab{-0.0111}&\dTab{0.0020}&\dTab{0.0011}
&\dTab{-0.0705}&\dTab{-0.0647}&\dTab{-0.0271}&\dTab{-0.0193}&\dTab{-0.0165}&\dTab{0.0016}
&\dTab{-0.1087}&\dTab{-0.0678}&\dTab{-0.0309}&\dTab{-0.0177}&\dTab{-0.0098}&\dTab{-0.0069}
&\dTab{-0.1203}&\dTab{-0.0624}&\dTab{-0.0217}&\dTab{-0.0107}&\dTab{-0.0011}&\dTab{-0.0030}
&\dTab{-0.2223}&\dTab{-0.1351}&\dTab{-0.0513}&\dTab{-0.0049}&\dTab{0.0025}&\dTab{0.0001}
&\dTab{-0.0737}&\dTab{-0.0660}&\dTab{-0.0331}&\dTab{-0.0348}&\dTab{-0.0127}&\dTab{-0.0051}
&\dTab{-0.1038}&\dTab{-0.0729}&\dTab{-0.0294}&\dTab{-0.0149}&\dTab{-0.0106}&\dTab{-0.0068}
&\dTab{-0.1228}&\dTab{-0.0648}&\dTab{-0.0199}&\dTab{-0.0122}&\dTab{-0.0033}&\dTab{-0.0042}
&\dT

## Data extraction for data analysis.
All plots in the results zip that are in the node or whole folders are used for data analysis. All data is plotted for give much more information. These plots are not used in the paper, but can be useful for in-depth analysis.

### Utility functions. Adapt if needed to the metrics wanted in the plots.

In [2]:
def print_results(output, output_runs, models, output_directory="."):
    def add_key(key):
        name = models[0][key]['name']
        if name == 'Mondrian':
            ram_count = str(int(models[0][key]['memory_size']) / 600000)
            return (name + ' ' + models[0][key]['tree_count'] + ' tree(s) (RAM x' + ram_count + ')', key, (name + ' (RAM x' + ram_count + ')', int(models[0][key]['tree_count'])))
        elif name == 'MCNN':
            if models[key]['cleaning'] == '1':
                return ('MCNN Origin ' + models[0][key]['cluster_count'] + ' clusters', key, ('MCNN Origin', int(models[0][key]['cluster_count'])))
            elif models[key]['cleaning'] == '2':
                return ('MCNN Mixe ' + models[0][key]['cluster_count'] + ' clusters', key, ('MCNN Mixe', int(models[0][key]['cluster_count'])))
            else:
                return ('MCNN OrpailleCC ' + models[0][key]['cluster_count'] + ' clusters', key, ('MCNN OrpailleCC', int(models[0][key]['cluster_count'])))
        else:
            return (name, key, (name, 0))
    ####### Controlling the datasets 1/4 ########
    knn_offline_f1 = {'banos_6': 0.86, 'banos_6_v1': 0.86, 'banos_6_v2': 0.86, 'banos_6_v3': 0.86, 'banos_6_v4': 0.86, 'banos_6_v5': 0.86, 'banos_6_v6': 0.86, 'banos_6_v7': 0.86, 'recofit_6': 0.40, 'drift_6' : 0.86}
    #(print name, key in models, tuple for sorting)
    keys = [add_key(key) for key in models[0] if models[0][key]['fullname'] != 'Previous']
    #grounp the third and second value to use dict to do a unique
    keys = dict([(key[0], (key[1], key[2])) for key in keys]).items()
    #Unpack the value part to separate the key and the sorting tuple
    keys = [(key[0], key[1][0], key[1][1]) for key in keys]
    keys = sorted(keys, key = lambda x: x[2])
    names = [key[0] for key in keys]
    print(names)
    plt.rcParams.update({'font.size': 33})
    ####### Controlling the datasets 2/4 ########
    #list_datastets = ['banos_6']
    list_datasets = ['banos_6', 'banos_6_v1', 'banos_6_v2', 'banos_6_v3','banos_6_v4', 'banos_6_v5', 'banos_6_v6','recofit_6']
    for dataset_name in list_datasets:
        print('Dataset: ' + dataset_name)
        daty, daty_std = [], []
        for i, precision in enumerate(PRECISIONS):
            datytmp = output[i][output[i].file.str.contains(dataset_name)]
            daty_stdtmp = datytmp[['fullname', 'element_count', 'f1', 'accuracy', 'memory']].groupby(['fullname', 'element_count']).std().reset_index()
            datytmp = datytmp[['fullname', 'element_count', 'f1', 'accuracy', 'memory']].groupby(['fullname', 'element_count']).mean().reset_index()
            daty.append(datytmp)
            daty_std.append(daty_stdtmp)
        #ACCURACY
        for ram in ['1.0)','2.0)', '5.0)']:
            fig = plt.figure(figsize=(23.38582, 16.53544))
            for name in names:
                if name[-4:]==ram:
                    for i, precision in enumerate(PRECISIONS):
                        style, color = line_style(name, precision)
                        plt.plot(daty[i][daty[i].fullname == name]['element_count'], daty[i][daty[i].fullname == name]['accuracy'], markevery=0.1, markersize=15, label="{}, {}".format(precision,name[8:-19]), color=color, linestyle=style)
            plt.legend(prop={"size":20},bbox_to_anchor=(2, 1), ncol=8)
            x = [a for a in daty[i][daty[i].fullname == name]['element_count']]
            y = [knn_offline_f1[dataset_name] for a in daty[i][daty[i].fullname == name]['element_count']]
            plt.plot(x, y, color='#000000', linestyle='-.', label='kNN Offline')
            plt.ylim(0,1)
            plt.ylabel("Accuracy")
            plt.xlabel("Element")
            plt.tight_layout()
            plt.savefig(output_directory + "/" + dataset_name + "_accuracy_RAM(x{}.png".format(ram))
            plt.clf()
            plt.close()
            #F1 Score DIF
            fig = plt.figure(figsize=(23.38582, 16.53544))
            for name in names:
                if name[-4:]==ram:
                    for i, precision in enumerate(PRECISIONS):
                        style, color = line_style(name, precision)
                        plt.plot(daty[i][daty[i].fullname == name]['element_count'],daty[i][daty[i].fullname == name]['f1']-daty[-1][daty[-1].fullname == name]['f1'], markevery=0.1, markersize=15, label="{}, {}".format(precision,name[8:-19]), color=color, linestyle=style)
            plt.ylabel("F1 score difference")
            plt.xlabel("Element")
            plt.ylim(-0.21,0.05)
            plt.tight_layout()
            plt.savefig(output_directory + "/" + dataset_name + "_f1_difference_RAM(x{}.png".format(ram))
            plt.clf()
            plt.close()
            #F1
            fig = plt.figure(figsize=(23.38582, 16.53544))
            for name in names:
                if name[-4:]==ram:
                    for i, precision in enumerate(PRECISIONS):
                        style, color = line_style(name, precision)
                        plt.plot(daty[i][daty[i].fullname == name]['element_count'], daty[i][daty[i].fullname == name]['f1'], markevery=0.1, markersize=15, label="{}, {}".format(precision,name[8:-19]), color=color, linestyle=style)
            x = [a for a in daty[i][daty[i].fullname == name]['element_count']]
            y = [knn_offline_f1[dataset_name] for a in daty[i][daty[i].fullname == name]['element_count']]
            plt.plot(x, y, color='#000000', linestyle='-.', label='kNN Offline')
            #plt.legend(prop={"size":20},ncol=4)
            plt.ylim(0,1)
            plt.ylabel("F1")
            plt.xlabel("Element")
            plt.tight_layout()
            plt.savefig(output_directory + "/" + dataset_name + "_f1_RAM(x{}.png".format(ram))
            plt.clf()
            plt.close()
            #F1 dev
            fig = plt.figure(figsize=(23.38582, 16.53544))
            for name in names:
                if name[-4:]==ram:
                    for i, precision in enumerate(PRECISIONS):
                        style, color = line_style(name, precision)
                        y1 = daty[i][daty[i].fullname == name]['f1'] - daty_std[i][daty_std[i].fullname == name]['f1']
                        y2 = daty[i][daty[i].fullname == name]['f1'] + daty_std[i][daty_std[i].fullname == name]['f1']
                        plt.plot(daty[i][daty[i].fullname == name]['element_count'], daty[i][daty[i].fullname == name]['f1'], color=color, linestyle=style, label="{}, {}".format(precision,name[8:-19]), markevery=0.1, markersize=15)
                        plt.fill_between(daty_std[i][daty_std[i].fullname == name]['element_count'], y1, y2, color=color, linestyle=style, alpha=0.05)
            x = [a for a in daty[i][daty[i].fullname == name]['element_count']]
            y = [knn_offline_f1[dataset_name] for a in daty[i][daty[i].fullname == name]['element_count']]
            plt.plot(x, y, color='#000000', linestyle='-.', label='kNN Offline')
            #plt.legend(prop={"size":20},ncol=4)
            plt.ylim(0,1)
            plt.ylabel("F1 +- Deviation")
            plt.xlabel("Element")
            plt.tight_layout()
            plt.savefig(output_directory + "/" + dataset_name + "_f1_dev_RAM(x{}.png".format(ram))
            plt.clf()
            #Dev only of F1
            fig = plt.figure(figsize=(23.38582, 16.53544))
            for name in names:
                if name[-4:]==ram:
                    for i, precision in enumerate(PRECISIONS):
                        style, color = line_style(name, precision)
                        plt.plot(daty_std[i][daty_std[i].fullname == name]['element_count'], daty_std[i][daty_std[i].fullname == name]['f1'], color=color, linestyle=style, label="{}, {}".format(precision,name[8:-19]), markevery=0.1, markersize=15)
            #plt.legend(prop={"size":20},ncol=4)
            plt.ylabel("F1 Deviation")
            plt.xlabel("Element")
            plt.tight_layout()
            plt.savefig(output_directory + "/" + dataset_name + "_dev_RAM(x{}.png".format(ram))
            plt.clf()
            plt.close()
    daty_52, daty_std_52 = [], []
    for i, dataset in enumerate(list_datasets[:-1]):
        datytmp = output[-1][output[-1].file.str.contains(dataset)]
        daty_stdtmp = datytmp[['fullname', 'element_count', 'f1', 'accuracy', 'memory']].groupby(['fullname', 'element_count']).std().reset_index()
        datytmp = datytmp[['fullname', 'element_count', 'f1', 'accuracy', 'memory']].groupby(['fullname', 'element_count']).mean().reset_index()
        daty_52.append(datytmp)
        daty_std_52.append(daty_stdtmp)
    
    for precision in [1,2,3,4,5,6,7,8]:
        print('Precision: {}'.format(precision))
        daty, daty_std = [], []
        for i, dataset in enumerate(list_datasets[:-1]):
            datytmp = output[precision-1][output[precision-1].file.str.contains(dataset)]
            daty_stdtmp = datytmp[['fullname', 'element_count', 'f1', 'accuracy', 'memory']].groupby(['fullname', 'element_count']).std().reset_index()
            datytmp = datytmp[['fullname', 'element_count', 'f1', 'accuracy', 'memory']].groupby(['fullname', 'element_count']).mean().reset_index()
            daty.append(datytmp)
            daty_std.append(daty_stdtmp)
            plt.close()
        #F1
        for ram in ['1.0)','2.0)', '5.0)']:
            fig = plt.figure(figsize=(23.38582, 16.53544))
            for name in names:
                if name[-4:]==ram:
                    for i, dataset in enumerate(list_datasets[:-1]):
                        if int(name[9:-19])==1:
                            style='-'
                        elif int(name[9:-19])==5:
                            style='--'
                        elif int(name[9:-19])==10:
                            style='-.'
                        elif int(name[9:-19])==50:
                            style=':'
                        if i>=7:
                            color='royalblue'
                        elif i>=6:
                            color='green'
                        elif i>=5:
                            color='yellowgreen'
                        elif i>=4:
                            color='gold'
                        elif i>=3:
                            color='orange'
                        elif i>=2:
                            color='chocolate'
                        elif i>=1:
                            color='red'
                        else:
                            color='violet'
                        plt.plot(daty[i][daty[i].fullname == name]['element_count'], daty[i][daty[i].fullname == name]['f1'], markevery=0.1, markersize=15, label="{}, {}".format(dataset, name[8:-19]), color=color, linestyle=style)
            x = [a for a in daty[i][daty[i].fullname == name]['element_count']]
            y = [knn_offline_f1[dataset] for a in daty[i][daty[i].fullname == name]['element_count']]
            plt.plot(x, y, color='#000000', linestyle='-.', label='kNN Offline')
            plt.legend(prop={"size":20},ncol=4)
            plt.ylim(0,1)
            plt.ylabel("F1")
            plt.xlabel("Element")
            plt.tight_layout()
            plt.savefig(output_directory + "/precision_{}_f1_RAM(x{}.png".format(precision,ram))
            plt.clf()
            plt.close()
        #F1 Dif
            fig = plt.figure(figsize=(23.38582, 16.53544))
            for name in names:
                if name[-4:]==ram:
                    for i, dataset in enumerate(list_datasets[:-1]):
                        if int(name[9:-19])==1:
                            style='-'
                        elif int(name[9:-19])==5:
                            style='--'
                        elif int(name[9:-19])==10:
                            style='-.'
                        elif int(name[9:-19])==50:
                            style=':'
                        if i>=7:
                            color='royalblue'
                        elif i>=6:
                            color='green'
                        elif i>=5:
                            color='yellowgreen'
                        elif i>=4:
                            color='gold'
                        elif i>=3:
                            color='orange'
                        elif i>=2:
                            color='chocolate'
                        elif i>=1:
                            color='red'
                        else:
                            color='violet'
                        plt.plot(daty[i][daty[i].fullname == name]['element_count'], daty[i][daty[i].fullname == name]['f1']-daty_52[i][daty_52[i].fullname == name]['f1'], markevery=0.1, markersize=15, label="{}, {}".format(dataset, name[8:-19]), color=color, linestyle=style)
            plt.legend(prop={"size":20},ncol=4)
            plt.ylabel("F1 score difference")
            plt.xlabel("Element")
            plt.tight_layout()
            plt.savefig(output_directory + "/precision_{}_f1_difference_RAM(x{}.png".format(precision,ram))
            plt.clf()
            plt.close()
def print_exponents(output, output_runs, models, output_directory="."):
    ####### Controlling the datasets 1/4 ########
    knn_offline_f1 = {'banos_6': 0.86, 'banos_6_v1': 0.86, 'banos_6_v2': 0.86, 'banos_6_v3': 0.86, 'banos_6_v4': 0.86, 'banos_6_v5': 0.86, 'banos_6_v6': 0.86, 'banos_6_v7': 0.86, 'recofit_6': 0.40, 'drift_6' : 0.86}
    #(print name, key in models, tuple for sorting)
    keys = [add_key(key) for key in models[0] if models[0][key]['fullname'] != 'Previous']
    #grounp the third and second value to use dict to do a unique
    keys = dict([(key[0], (key[1], key[2])) for key in keys]).items()
    #Unpack the value part to separate the key and the sorting tuple
    keys = [(key[0], key[1][0], key[1][1]) for key in keys]
    keys = sorted(keys, key = lambda x: x[2])
    names = [key[0] for key in keys]
    print(names)
    plt.rcParams.update({'font.size': 33})
    ####### Controlling the datasets 2/4 ########
    list_datasets = ['banos_6', 'banos_6_v1', 'banos_6_v2', 'banos_6_v3','banos_6_v4', 'banos_6_v5', 'banos_6_v6','recofit_6']
    for dataset_name in list_datasets:
        print('Dataset: ' + dataset_name)
        daty, daty_std = [], []
        for i, exponent in enumerate(EXPONENTS):
            datytmp = output[i][output[i].file.str.contains(dataset_name)]
            daty_stdtmp = datytmp[['fullname', 'element_count', 'f1', 'accuracy', 'memory']].groupby(['fullname', 'element_count']).std().reset_index()
            datytmp = datytmp[['fullname', 'element_count', 'f1', 'accuracy', 'memory']].groupby(['fullname', 'element_count']).mean().reset_index()
            daty.append(datytmp)
            daty_std.append(daty_stdtmp)
        for ram in ['1.0)','2.0)','5.0)']:
            #F1 Score DIF
            fig = plt.figure(figsize=(23.38582, 16.53544))
            for name in names:
                if name==ram:
                    for i, exponent  in enumerate(EXPONENTS):
                        style, color = line_style(name, exponent)
                        plt.plot(daty[i][daty[i].fullname == name]['element_count'],daty[i][daty[i].fullname == name]['f1']-daty[-1][daty[-1].fullname == name]['f1'], markevery=0.1, markersize=15, label="{}, {}".format(exponent,name[8:-19]), color=color, linestyle=style)
            plt.legend(prop={"size":20})
            plt.ylabel("F1 score difference")
            plt.xlabel("Element")
            plt.tight_layout()
            plt.savefig(output_directory + "/" + dataset_name + "_f1_difference_RAM(x{}.png".format(ram))
            plt.clf()
            plt.close()
            #F1
            fig = plt.figure(figsize=(23.38582, 16.53544))
            for name in names:
                if name[-4:]==ram:
                    for i, exponent in enumerate(EXPONENTS):
                        style, color = line_style(name, exponent)
                        plt.plot(daty[i][daty[i].fullname == name]['element_count'], daty[i][daty[i].fullname == name]['f1'], markevery=0.1, markersize=15, label="{}, {}".format(exponent,name[8:-19]), color=color, linestyle=style)
            x = [a for a in daty[i][daty[i].fullname == name]['element_count']]
            y = [knn_offline_f1[dataset_name] for a in daty[i][daty[i].fullname == name]['element_count']]
            plt.plot(x, y, color='#000000', linestyle='-.', label='kNN Offline')
            plt.legend(prop={"size":20})
            plt.ylim(0,1)
            plt.ylabel("F1")
            plt.xlabel("Element")
            plt.tight_layout()
            plt.savefig(output_directory + "/" + dataset_name + "_f1_RAM(x{}.png".format(ram))
            plt.clf()
            plt.close()
            #F1 dev
            fig = plt.figure(figsize=(23.38582, 16.53544))
            for name in names:
                if name[-4:]==ram:
                    for i, exponent in enumerate(EXPONENTS):
                        style, color = line_style(name, exponent)
                        y1 = daty[i][daty[i].fullname == name]['f1'] - daty_std[i][daty_std[i].fullname == name]['f1']
                        y2 = daty[i][daty[i].fullname == name]['f1'] + daty_std[i][daty_std[i].fullname == name]['f1']
                        plt.plot(daty[i][daty[i].fullname == name]['element_count'], daty[i][daty[i].fullname == name]['f1'], color=color, linestyle=style, label="{}, {}".format(exponent,name[8:-19]), markevery=0.1, markersize=15)
                        plt.fill_between(daty_std[i][daty_std[i].fullname == name]['element_count'], y1, y2, color=color, linestyle=style, alpha=0.05)
            x = [a for a in daty[i][daty[i].fullname == name]['element_count']]
            y = [knn_offline_f1[dataset_name] for a in daty[i][daty[i].fullname == name]['element_count']]
            plt.plot(x, y, color='#000000', linestyle='-.', label='kNN Offline')
            plt.legend(prop={"size":20})
            plt.ylim(0,1)
            plt.ylabel("F1 +- Deviation")
            plt.xlabel("Element")
            plt.tight_layout()
            plt.savefig(output_directory + "/" + dataset_name + "_f1_dev_RAM(x{}.png".format(ram))
            plt.clf()

### Result plots for mantissa precision comparison and ordering comparison.

In [3]:
# Main tool used for data analysis.
ins = 'node'
dir_name = '{}_results'.format(ins)
exp = '11'
out_name = '{}_{}_plots'.format(exp, ins)
output_lst, output_run_lst, models_lst = [], [], []
PRECISIONS = [i for i in range(1,52)]
for i in PRECISIONS:
    output, output_runs, models = process_output("{}/{}_output_{}".format(dir_name,exp,i),"{}/{}_output_runs_{}".format(dir_name,exp,i),"{}/{}_models_{}.csv".format(dir_name,exp,i))
    output_lst.append(output)
    output_run_lst.append(output_runs)
    models_lst.append(models)
print_results(output_lst, output_run_lst, models_lst, output_directory=out_name)

### Result plots for exponent precision comparison.

In [None]:
#plotting the exponents all on one graph
dir_name = 'exp_whole_results'
precision = '52'
out_name = 'exp_whole_plots'
output_lst, output_run_lst, models_lst = [], [], []
EXPONENTS = range(3,12)
for e in EXPONENTS:
    output, output_runs, models = process_output("{}/{}_output_{}".format(dir_name,e,precision),"{}/{}_output_runs_{}".format(dir_name,e,precision),"{}/{}_models_{}.csv".format(dir_name,e,precision))
    output_lst.append(output)
    output_run_lst.append(output_runs)
    models_lst.append(models)
print_exponents(output_lst, output_run_lst, models_lst, output_directory=out_name)