In [2]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path
from itertools import product

In [3]:
# env variables
import os  # imported here because this cell runs before the cell that imports os

# Repository root. Set RUNTIME_MONITORING_REPO to run the notebook on another
# machine; the original location stays the default so existing runs are unchanged.
REPO_PATH = os.environ.get('RUNTIME_MONITORING_REPO', '/home/ah19/runtime-monitoring')
DATASET = 'FashionMNIST'
PREFIX = 'Adam-128-100'
# Common stem used for per-experiment folders and file names in later cells.
FILENAME_POSTFIX = f'{DATASET}_{PREFIX}'
# NOTE: the name is misspelled ("FALVOR"); it is kept because other cells reference it.
DATA_FALVOR = 'raw'

In [4]:
import os
# The project-local `utilities` package is imported relative to the working
# directory, which is expected to be two levels above the notebook.
# The chdir is guarded so that re-running this cell does not climb two more
# directories and break the imports below.
if not os.path.isdir('utilities'):
    os.chdir('../..')
from utilities.utils import load_json
from utilities.pathManager import fetchPaths

In [19]:
# paths
base = Path(REPO_PATH)
paths = fetchPaths(base, DATASET)

# entry keyed by the lower-cased dataset name
path = paths[DATASET.lower()]

# last-hidden-layer exports for the selected data flavor
path_lastHiddenLayer = paths[f'lastHiddenLayer_{DATA_FALVOR}'] / FILENAME_POSTFIX

# PCA artefacts for the last hidden layer
path_lastHiddenLayer_pca = paths['lastHiddenLayer_pca']
path_lastHiddenLayer_pca_single = path_lastHiddenLayer_pca / FILENAME_POSTFIX / 'Single'
# path_lastHiddenLayer_pca_classes = path_lastHiddenLayer_pca / FILENAME_POSTFIX / 'Classes'

# BDD locations for this experiment
path_bdd = paths[f'bdd_{DATA_FALVOR}'] / FILENAME_POSTFIX
save_path = paths[f'bdd_testingThresholds_{DATA_FALVOR}'] / FILENAME_POSTFIX

In [6]:

# import Data
train_csv = path_lastHiddenLayer / f'{FILENAME_POSTFIX}_train.csv'
test_csv = path_lastHiddenLayer / f'{FILENAME_POSTFIX}_test.csv'
neurons_json = path_lastHiddenLayer_pca_single / f'{FILENAME_POSTFIX}_neurons.json'

print('Loading train Data ...')
df = pd.read_csv(train_csv)

# split train data: keep the rows flagged `true`, then drop the flag column
keep_rows = df['true'] == True
df_true = df.loc[keep_rows].drop(columns='true').reset_index(drop=True)

print('Loading test Data ...')
df_test = pd.read_csv(test_csv)


print('Loading Neurons ...')
neurons = load_json(neurons_json)

# neurons = None

Loading train Data ...
Loading test Data ...
Loading Neurons ...


In [21]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from dd.autoref import BDD
from pympler import asizeof
import time

class MonitorBDD:
    """Store thresholded activation patterns in a BDD and test membership.

    Every monitored column of a dataframe is compared against one to three
    thresholds, producing a small integer per column. Each row is then turned
    into a conjunction of BDD literals and OR-ed into ``self.roots``.
    A row is "in pattern" when its conjunction intersects ``self.roots``.
    """

    def __init__(self, num_neurons, thld_1, thld_2=None, thld_3=None, neurons=None, max_time=0, verbose=False):
        """Create an empty monitor.

        Parameters
        ----------
        num_neurons : int
            Number of leading dataframe columns to monitor when ``neurons``
            is None.
        thld_1, thld_2, thld_3 : array-like
            Per-column thresholds. When ``neurons`` is given they are indexed
            with the parsed neuron positions, so they must support integer
            array indexing (e.g. numpy arrays).
        neurons : iterable of str, optional
            Column selectors; the first character of each entry is dropped and
            the remainder parsed as an integer column position.
        max_time, verbose
            Stored on the instance; not read anywhere in this class.
        """
        self.bdd = BDD()
        self.roots = self.bdd.false
        self.num_neurons = num_neurons
        self.verbose = verbose
        self.max_time = max_time

        self.thld_1 = thld_1
        self.thld_2 = thld_2
        self.thld_3 = thld_3
        # a second bit per neuron is only needed once more than one threshold is used
        self.num_bits = 2 if thld_2 is not None or thld_3 is not None else 1

        self.neurons = neurons
        if self.neurons is not None:
            self.neurons = np.array([int(n[1:]) for n in neurons])
            self.thld_1 = thld_1[self.neurons]
            self.thld_2 = thld_2[self.neurons] if self.thld_2 is not None else None
            self.thld_3 = thld_3[self.neurons] if self.thld_3 is not None else None

        # count the variables that are really declared: the neuron subset when
        # one is given, otherwise all `num_neurons` columns
        monitored = len(self.neurons) if self.neurons is not None else num_neurons
        self.num_vars = monitored * self.num_bits

        self.vars, self.vars_not = self.__declare_vars()

        # one row is appended per add_dataframe call; 'thld' and 'successful'
        # are declared here but never filled in by this class
        self.stats = pd.DataFrame({
            'thld': [],
            'df': [],
            'build_time': [],
            'size_before_reorder_mb': [],
            'reorder_time': [],
            'size_after_reorder_mb': [],
            'successful': []
        })


    def __declare_vars(self):
        """Declare the BDD variables and return (positive, negated) literal arrays.

        Variables are named ``x{neuron}_{bit}``.
        """
        # generate vars either x0_0 or x0_0 and x0_1 per neuron
        vars_range = self.neurons if self.neurons is not None else range(self.num_neurons)
        # NOTE(review): with two bits the names are generated bit-major
        # (x0_0, x1_0, ..., x0_1, ...) while __multi_thlds lays the two bits of
        # a neuron next to each other. Literals are matched to pattern columns
        # by position only, and building and checking share that mapping, so
        # membership results are unaffected; only the names are misleading.
        names = [f'x{n}_{i}' for i in range(self.num_bits) for n in vars_range]

        # add vars to bdd
        for name in names:
            self.bdd.add_var(name)

        # generate negative vars
        vars = np.array([ *map(self.bdd.var, names) ])
        vars_not = np.array([ ~v for v in vars ])

        return vars, vars_not


    def __multi_thlds(self, x):
        """Expand one row of threshold levels into two bits per entry.

        Encoding: 0 -> [0, 0], 1 -> [1, 0], 2 -> [0, 1], anything else -> [1, 1].
        Returns a flat array twice as long as ``x``.
        """
        x = x.reshape(x.shape[0], 1)
        x = np.where(x == [0], [0, 0], # 0
             np.where(x == [1], [1, 0], # 1
             np.where(x == [2], [0, 1], # 2
             [1, 1] ) ) )# 3
        return np.reshape(x, x.shape[0] * 2)


    def __applying_thlds(self, df):
        """Return, per cell, how many of the configured thresholds it reaches.

        The result is a numpy array of int8 values in 0..3.
        """
        df_thld = (df >=  self.thld_1).astype('int8')

        if self.thld_2 is not None:
            df_thld += (df >=  self.thld_2).astype('int8')

        if self.thld_3 is not None:
            df_thld += (df >=  self.thld_3).astype('int8')

        return df_thld.to_numpy()


    def check_pattern_length(self, row):
        """Assert that ``row`` (before two-bit expansion) matches the declared variables."""
        if self.num_bits == 2:
            assert len(self.vars)/2 == row.shape[0], "ERROR: VARS and ROW do not match!"
        else:
            assert len(self.vars) == row.shape[0], "ERROR: VARS and ROW do not match!"


    def construct_one_pattern(self, row):
        """Return the conjunction of literals described by a 0/1 ``row``."""
        # replace 1 with vars and 0 with vars_not
        expr = np.where( row == 1, self.vars, self.vars_not )
        return np.bitwise_and.reduce(expr)


    def __add_patterns(self, rows):
        """OR the conjunction of every row of ``rows`` into ``self.roots``."""
        # np.apply_along_axis raises when there is nothing to iterate over
        if rows.shape[0] == 0:
            return
        self.roots |= np.bitwise_or.reduce( np.apply_along_axis(self.construct_one_pattern, 1, rows) )


    def flip_bit(self, patterns, eta):
        """Yield relaxed copies of ``patterns`` with ``eta`` adjacent bits forced to 1.

        For every window start ``nth`` in ``range(patterns.shape[1] - eta + 1)``
        a copy of ``patterns`` is made, columns ``nth:nth+eta`` are set to 1,
        duplicate rows are dropped and the result is yielded.

        With ``eta == 0`` the window is empty, so every yielded array is simply
        the de-duplicated input (yielded ``patterns.shape[1] + 1`` times).

        Not called anywhere in this class.
        """
        for nth in range(patterns.shape[1]-eta+1):
            temp = patterns.copy()
            temp[:, nth:nth+eta] = 1
            temp = np.unique(temp, axis=0)
            yield temp


    def add_dataframe(self, df, eta=0, eval_dfs=None):
        """Add the patterns of ``df`` to the monitor and record build statistics.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame whose monitored columns are thresholded and stored.
        eta : int
            Only used to filter ``self.stats`` when ``eval_dfs`` is None.
            Patterns are always stored unmodified; ``flip_bit`` is not applied.
        eval_dfs : list of pandas.DataFrame, optional
            Frames to evaluate right after building. Each one gets a ``bdd_0``
            column added in place.

        Returns
        -------
        list or None
            ``eval_dfs`` when it was given, otherwise None.
        """
        start = time.perf_counter()
        if self.neurons is not None:
            df = df[df.columns[self.neurons]].drop_duplicates()
        else:
            df = df[df.columns[:self.num_neurons]].drop_duplicates()

        patterns = self.__applying_thlds(df)

        # distinct activation rows often collapse to the same thresholded
        # pattern; OR-ing a pattern twice does not change the BDD, so build
        # each distinct pattern only once
        if patterns.shape[0] > 1:
            patterns = np.unique(patterns, axis=0)

        if self.num_bits == 2 and patterns.shape[0] > 0:
            patterns = np.apply_along_axis(self.__multi_thlds, 1, patterns)

        self.__add_patterns(patterns)

        build_time = round(time.perf_counter() - start, 3)

        row = self.stats.shape[0]+1
        # the stored patterns are exact, i.e. degree of freedom 0
        self.stats.loc[row, 'df'] = 0

        self.stats.loc[row, 'build_time'] = build_time
        self.stats.loc[row, 'size_before_reorder_mb'] = round( asizeof.asizeof(self) * 1e-6, 3)

        start = time.perf_counter()
        BDD.reorder(self.bdd)
        bdd_reorder_time = round(time.perf_counter() - start, 3)

        self.stats.loc[row, 'reorder_time'] = bdd_reorder_time
        self.stats.loc[row, 'size_after_reorder_mb'] = round( asizeof.asizeof(self) * 1e-6, 3)

        # add column for scoring and return evaluated dataframes
        if eval_dfs is not None:
            for eval_df in eval_dfs:
                self.evaluate_dataframe(eval_df, 0)
            return eval_dfs

        self.stats = self.stats.loc[self.stats['df'] == eta]
        return


    def check_one_pattern(self, row):
        """Return 1 when the pattern of ``row`` intersects ``self.roots``, else 0."""
        if (self.roots & self.construct_one_pattern(row) ) == self.bdd.false:
            return 0 # means not found
        else:
            return 1 # found it


    def evaluate_dataframe(self, df, eta=None):
        """Check every row of ``df`` against the monitor.

        When ``eta`` is given the result is written in place to ``df`` as
        column ``bdd_{eta}`` and None is returned. Otherwise the result is
        written to column ``bdd`` and ``score_dataframe(df)`` is returned.
        """
        if self.neurons is not None:
            patterns = self.__applying_thlds(df[df.columns[self.neurons]])
        else:
            patterns = self.__applying_thlds(df[df.columns[:self.num_neurons]])

        if patterns.shape[0] == 0:
            # np.apply_along_axis raises on an empty iteration dimension
            bdd_results = np.zeros(0, dtype=np.int8)
        else:
            if self.num_bits == 2:
                patterns = np.apply_along_axis(self.__multi_thlds, 1, patterns)
            bdd_results = np.apply_along_axis(self.check_one_pattern, 1, patterns)

        if eta is not None:
            df[f'bdd_{eta}'] = bdd_results
            return

        # if the function called specifically, return scored df after evaluating
        df['bdd'] = bdd_results

        return self.score_dataframe(df)


    def score_dataframe(self, df, bdd_col='bdd'):
        """Summarise out-of-pattern rows per class.

        ``df`` must contain the columns ``y``, ``true`` and ``bdd_col``.
        When ``bdd_col`` is missing an empty frame with the score columns is
        returned.

        The result has one row per value of ``y`` plus the rows ``all`` and
        ``all_mean``, with the columns ``count``, ``{bdd_col}_false``,
        ``{bdd_col}_false_miss_classified``, ``outOfPattern`` and
        ``outOfPatternMissClassified``. For the default ``bdd_col='bdd'`` the
        index is reset into a column before returning.
        """
        if bdd_col not in df.columns:
            return pd.DataFrame({
                        'y': []
                        ,'count': []
                        ,'_false': []
                        ,'_false_miss_classified': []
                        ,'outOfPattern': []
                        ,'outOfPatternMissClassified': []
                        ,'eta':[]
                    })

        false_col = bdd_col + '_false'
        miss_col = bdd_col + '_false_miss_classified'

        out_of_pattern = df[bdd_col] == 0
        out_of_pattern_misclassified = out_of_pattern & (df['true'] == False)

        df_all_classes = df[['y', 'true']].groupby('y').count().sort_index()
        df_all_classes.columns = ['count']

        df_out_of_pattern_images = df.loc[out_of_pattern, ['y', bdd_col]].groupby('y').count().sort_index()
        df_out_of_pattern_images.columns = [false_col]

        df_out_of_pattern_misclassified_images = (
            df.loc[out_of_pattern_misclassified, ['y', bdd_col]].groupby('y').count().sort_index()
        )
        df_out_of_pattern_misclassified_images.columns = [miss_col]

        df_scores = df_all_classes.join(df_out_of_pattern_images).join(df_out_of_pattern_misclassified_images)

        total_images = df_all_classes['count'].sum()
        df_scores.loc['all', :] = [
            total_images,
            out_of_pattern.sum(),
            out_of_pattern_misclassified.sum(),
        ]

        # if data frame return 0 rows, a nan will be placed
        df_scores = df_scores.fillna(0)

        # calculate metrics
        df_scores['outOfPattern'] = df_scores[false_col] / df_scores['count']
        df_scores['outOfPatternMissClassified'] = df_scores[miss_col] / df_scores[false_col]

        # add mean of all classes
        per_class = df_scores.index != 'all'
        a1 = df_scores.loc[per_class, 'outOfPattern'].mean()
        a2 = df_scores.loc[per_class, 'outOfPatternMissClassified'].mean()
        # NOTE(review): the values are assigned in column order, so total_images
        # lands in the '*_false_miss_classified' column rather than in 'count'.
        # Kept as-is because downstream consumers may rely on it; please confirm.
        df_scores.loc['all_mean', :] = [0, 0, total_images, a1, a2]

        # A 0/0 division (nothing out of pattern) yields NaN, which becomes 0.0;
        # an exact 0.0 ratio becomes 1.0. Both replacements are evaluated against
        # the original values. The result is assigned back instead of using a
        # chained `inplace=True`, which does not update the frame under pandas
        # Copy-on-Write.
        df_scores['outOfPatternMissClassified'] = (
            df_scores['outOfPatternMissClassified'].replace({np.nan: 0.0, 0.0: 1.0})
        )
        df_scores['outOfPattern'] = df_scores['outOfPattern'].replace({np.nan: 0.0})

        # no missclassification for a class
        df_scores[false_col] = df_scores[false_col].replace({np.nan: 0.0})
        df_scores[miss_col] = df_scores[miss_col].replace({np.nan: 0.0})

        if bdd_col=='bdd':
            return df_scores.reset_index()
        return df_scores


    def score_dataframe_multi_eta(self, df, eta):
        """Score ``df`` on the columns ``bdd_0`` .. ``bdd_{eta}`` and stack the results.

        Every partial result gets an ``eta`` column and the ``bdd_{et}`` prefix
        is stripped from its column names. The stacked frame is returned with
        its index reset.
        """
        frames = []

        for et in range(eta+1):
            temp = self.score_dataframe(df, f'bdd_{et}')
            temp['eta'] = et
            temp.columns = [*map(lambda x: x.replace(f'bdd_{et}', ''), temp.columns)]
            frames.append(temp)

        # pd.concat rejects an empty list (negative eta)
        if not frames:
            return pd.DataFrame().reset_index()

        # concatenate once instead of growing a frame inside the loop
        return pd.concat(frames).reset_index()


    def plot_stats(self, df, stage, true=True, save_folder=None, prefix=None):
        """Bar-plot the ``outOfPattern`` column of ``df`` per ``y``.

        Only the rows with ``df['true'] == true`` are plotted. A dashed red
        line marks their mean. When ``save_folder`` is given the figure is also
        saved there as ``bdd_scores_{stage.lower()}_{true}_outOfPattern_{prefix}.jpg``.
        """
        df = df.loc[df['true'] == true].set_index('y')
        mean_ = df['outOfPattern'].mean().round(3)

        # an empty suffix keeps the literal "None" out of the title
        neurons = f' Number of neuron: {len(self.neurons)}' if self.neurons is not None else ''
        title = f'{stage} - {true} - #out of pattern: {mean_}{neurons}\n'
        filename = f'{stage.lower()}_{true}_outOfPattern'
        color = 'teal' if true else 'orange'

        df['outOfPattern'].plot(
            kind='bar', title=title, legend=False, color=color, xlabel='', hatch='x', edgecolor='black')
        plt.yticks(np.arange(0, 1.1, 0.1))
        plt.axhline(mean_, color='red', linewidth=2, linestyle='--')


        if save_folder is not None:
            plt.savefig(save_folder / f'bdd_scores_{filename}_{prefix}.jpg', dpi=150, transparent=False)

        plt.show()


In [22]:
# define threshold
p = 0.3
thld_name = f'qth_{p}'

# per-column p-quantile of df_true with the 'y' column left out
thld = np.quantile(df_true.drop(columns='y'), p, axis=0)

# degree of freedom
eta = 0

In [23]:
# every column of df_true except one is handed to the monitor as a neuron column
monitored_width = df_true.shape[1] - 1
patterns = MonitorBDD(monitored_width, thld, neurons=neurons, max_time=10)

# evaluate copies so the loaded frames are not modified by the added bdd column
eval_frames = [df.copy(), df_test.copy()]
df_2, df_test_2 = patterns.add_dataframe(df_true, eta, eval_dfs=eval_frames)

In [26]:
# Build and reorder timings (seconds, via time.perf_counter) and pympler-measured
# object sizes (MB) recorded by add_dataframe.
patterns.stats

Unnamed: 0,thld,df,build_time,size_before_reorder_mb,reorder_time,size_after_reorder_mb,successful
1,,0.0,854.316,24582.464,4460.514,8151.266,


In [25]:
# Score the evaluated test frame for every degree 0..eta; each degree reads the
# matching `bdd_{degree}` column that add_dataframe/evaluate_dataframe wrote.
score = patterns.score_dataframe_multi_eta(df_test_2, eta)

score

Unnamed: 0,y,count,_false,_false_miss_classified,outOfPattern,outOfPatternMissClassified,eta
0,0,1000.0,996.0,150.0,0.996,0.150602,0
1,1,1000.0,780.0,18.0,0.78,0.023077,0
2,2,1000.0,999.0,103.0,0.999,0.103103,0
3,3,1000.0,939.0,49.0,0.939,0.052183,0
4,4,1000.0,995.0,65.0,0.995,0.065327,0
5,5,1000.0,942.0,8.0,0.942,0.008493,0
6,6,1000.0,993.0,193.0,0.993,0.194361,0
7,7,1000.0,957.0,27.0,0.957,0.028213,0
8,8,1000.0,950.0,11.0,0.95,0.011579,0
9,9,1000.0,962.0,28.0,0.962,0.029106,0
