In [153]:
import os
import pandas as pd

import sklearn.ensemble as ske

from bounos.Analyses.Weight import summed_outliers_per_weight

def categorise_dataframe(df):
    """Convert every object-dtype column of ``df`` to the ``category`` dtype.

    The frame is modified in place and is also returned, so the call can be
    chained.  A column whose conversion raises ``TypeError`` is reported on
    stdout and left untouched.
    """
    # Categories work better as indexes
    object_columns = df.columns[df.dtypes == object]
    for column in object_columns:
        try:
            df[column] = df[column].astype('category')
        except TypeError:
            print("Couldn't categorise {}".format(column))
    return df

def non_zero_rows(df):
    """Return the rows of ``df`` that hold at least one entry not equal to zero."""
    has_non_zero_entry = (df != 0).any(axis=1)
    return df[has_non_zero_entry]

The questions we want to answer are:
1. Which metrics differentiate between which behaviours?
2. Do these metrics cross over domains (e.g. comms impacting behaviour)?

To answer these questions, we first have to reshape the raw dataframe so that it is indexed by metric weights and keyed by behaviour (`var`), taking the perspective of the observer towards the (potential) attacker in a particular run, summed across time.

While this analysis simply sums both the upper and lower outliers, **this needs to be extended/readdressed**.

# IMPORTANT 
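The July 3rd simulation run had a small mistake: the RandomFlatWalk behaviour was incorrectly configured. *(The original sentence is cut off after "where the Ran"; this completion is inferred from the RandomFlatWalk remark later in the notebook and should be confirmed.)*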

In [154]:
# Analysis configuration.  `observer` and `target` are presumably node names
# (the perspective of one node on another) -- TODO confirm.
observer = 'Bravo'
target = 'Alfa'
n_nodes = 6
n_metrics = 9

# Earlier (2015-07-03) run, superseded by the 2015-07-20 run used below:
# "/home/bolster/src/aietes/results/Malicious Behaviour Trust Comparison-2015-07-03-16-45-26"
results_path = "/home/bolster/src/aietes/results/Malicious Behaviour Trust Comparison-2015-07-20-17-47-53"
shared_h5_path = '/dev/shm/shared.h5'

# This cell only reads from the shared store, so open it read-only; the
# context manager closes the file handle once the frames are loaded.
with pd.HDFStore(shared_h5_path, mode='r') as store:
    joined_target_weights = store.get('joined_target_weights')
    joined_feats = store.get('joined_feats')
    comms_only_feats = store.get('comms_only_feats')
    phys_only_feats = store.get('phys_only_feats')
    comms_only_weights = store.get('comms_only_weights')
    phys_only_weights = store.get('phys_only_weights')

# For each feature frame: drop the all-zero rows, transpose, and convert any
# object-dtype columns of the transposed frame to categoricals.
joined_feat_weights = categorise_dataframe(non_zero_rows(joined_feats).T)
comms_feat_weights = categorise_dataframe(non_zero_rows(comms_only_feats).T)
phys_feat_weights = categorise_dataframe(non_zero_rows(phys_only_feats).T)

In [155]:
# Express every column relative to the 'CombinedTrust' column by subtracting
# that column from each column, aligned on the row index.
weight_df = joined_target_weights.sub(joined_target_weights['CombinedTrust'], axis=0)

In [156]:
# Column of `weight_df` to regress on.  It is kept under its own name so that
# the configured `target` ('Alfa'), which other cells pass to
# `.xs(target, level='target')`, is not silently overwritten by running this
# cell.
regression_target = 'CombinedBadMouthingPowerControl'

# Flatten the index into ordinary columns: everything except the chosen column
# becomes the feature matrix, the chosen column becomes the label vector.
df = weight_df[regression_target].reset_index()
data = df.drop(regression_target, axis=1).values
labels = df[regression_target].values

# Two tree-ensemble regressors with identical size and parallelism settings.
etr = ske.ExtraTreesRegressor(n_jobs=4, n_estimators=512)
rtr = ske.RandomForestRegressor(n_jobs=4, n_estimators=512)

In [157]:
# `sp.stats` is used below but `sp` is not imported anywhere else in this
# notebook, so the cell failed on a fresh kernel; import it explicitly.
import scipy as sp
import scipy.stats
# NOTE(review): `sklearn.cross_validation` and the 'mean_squared_error' scorer
# name belong to the scikit-learn releases this notebook was written against;
# later releases moved/renamed them -- confirm before upgrading.
from sklearn import linear_model, cross_validation

linr = linear_model.LinearRegression()
# Cross-validate each regressor on the same data and print the per-fold scores
# followed by their summary statistics.
for reg in [etr, rtr, linr]:
    scores = cross_validation.cross_val_score(reg, data, labels, scoring='mean_squared_error', n_jobs=4)
    # Single-argument print call: same text as `print scores, describe(...)`
    # on Python 2 and also valid on Python 3.
    print("{} {}".format(scores, sp.stats.describe(scores)))

[-2.647 -0.719 -1.053] DescribeResult(nobs=3, minmax=(-2.6469238688140218, -0.71899058657295978), mean=-1.4729789834756615, variance=1.0615042743276055, skewness=-0.6244115587804627, kurtosis=-1.4999999999999996)
[-2.604 -0.714 -0.952] DescribeResult(nobs=3, minmax=(-2.6041888415286918, -0.71355304141383746), mean=-1.4230939712018928, variance=1.0603982712971967, skewness=-0.6648550723268009, kurtosis=-1.5000000000000004)
[-3.028 -0.821 -1.136] DescribeResult(nobs=3, minmax=(-3.0280587211601571, -0.82062555015657546), mean=-1.6616597054254643, variance=1.4251964785709224, skewness=-0.6518939832871349, kurtosis=-1.5000000000000004)


In [151]:
# `sp.stats` is used below but `sp` is not imported anywhere else in this
# notebook, so the cell failed on a fresh kernel; import it explicitly.
import scipy as sp
import scipy.stats
from sklearn import linear_model, cross_validation

linr = linear_model.LinearRegression()
# Cross-validate each regressor on whatever `data` / `labels` currently hold
# and print the per-fold scores followed by their summary statistics.
for reg in [etr, rtr, linr]:
    scores = cross_validation.cross_val_score(reg, data, labels, scoring='mean_squared_error', n_jobs=4)
    # Single-argument print call: same text as `print scores, describe(...)`
    # on Python 2 and also valid on Python 3.
    print("{} {}".format(scores, sp.stats.describe(scores)))

[-2.962 -0.959 -1.107] DescribeResult(nobs=3, minmax=(-2.9621733228118092, -0.95888797451615915), mean=-1.675997737110861, variance=1.2461649799446983, skewness=-0.6931416166960097, kurtosis=-1.4999999999999998)
[-2.978 -1.074 -1.221] DescribeResult(nobs=3, minmax=(-2.9775661956890866, -1.0736179750508374), mean=-1.7572495207388548, variance=1.1222779026962888, skewness=-0.6918317244030849, kurtosis=-1.5)
[-4.384 -1.639 -1.767] DescribeResult(nobs=3, minmax=(-4.3842671501441313, -1.6393857266668597), mean=-2.5969818644062905, variance=2.3998815689673991, skewness=-0.7016876752653842, kurtosis=-1.5000000000000002)


In [148]:
# Summary statistics for whatever cross-validation scores `scores` currently holds.
sp.stats.describe(scores)

DescribeResult(nobs=3, minmax=(-2.9621733228118092, -0.95888797451615915), mean=-1.675997737110861, variance=1.2461649799446983, skewness=-0.6931416166960097, kurtosis=-1.4999999999999998)

In [None]:
# The last `n_metrics` column labels are taken as the metric keys; together
# with 'var' and 't' they become the index (note: mutates `weight_df` in place).
metric_keys = weight_df.columns[-n_metrics:].tolist()
weight_df.set_index(metric_keys + ['var', 't'], inplace=True)
# REMEMBER TO CHECK THIS WHEN DOING MULTIPLE RUNS (although technically it shouldn't matter....)
weight_df.drop(['observer', 'run'], axis=1, inplace=True)
# Sum for each run: group on every index level except the last one, then
# pivot the 'var' level into the columns.
time_summed_weights = (
    weight_df
    .groupby(level=list(weight_df.index.names[:-1]))
    .sum()
    .unstack('var')
)
# Keep only the columns for the configured target; Nans map to no outliers
target_weights = time_summed_weights.xs(target, level='target', axis=1).fillna(0.0)

In [None]:
# Single DataFrame of all features against known good
# Divide every column by the 'CombinedTrust' column (aligned on the row index)
# and drop the rows that contain NaN.
var_weights = target_weights.div(target_weights.CombinedTrust, axis=0).dropna()
# Single DataFrame of all features against known good: one `feature_extractor`
# result per column, stacked under the index level names ['var', 'metric'].
known_good_features = pd.concat(
    [feature_extractor(s.reset_index(), var) for var, s in var_weights.iteritems()],
    keys=var_weights.keys(),
    names=['var', 'metric'],
)

These results handily confirm that there is a 'signature' of badmouthing, as RandomFlatWalk was incorrectly configured.

Need to: 
1. Perform multi-run tolerance analysis of metrics (i.e. turn the below into a boxplot)
2. Perform cross correlation analysis on metrics across runs/behaviours (what metrics are redundant)

In [None]:
# Bar chart of the unstacked feature table; binding the result to `_` keeps the
# returned plot object from being echoed as the cell's output.
_=known_good_features.unstack().plot(kind='bar')

In [None]:
# Box plot over the columns of the unstacked feature table; binding the result
# to `_` keeps the returned object from being echoed as the cell's output.
_=known_good_features.unstack().boxplot()

In [None]:
import operator

def target_weight_feature_extractor(target_weights):
    """Extract features for every column, using each column in turn as the baseline.

    For each column label ``basekey`` of ``target_weights``, every column is
    divided by the ``basekey`` column (aligned on the row index), rows
    containing NaN are dropped, and ``feature_extractor`` is applied to each
    resulting column.  The per-column results are concatenated with the index
    level names ``['var', 'metric']``.

    Each ``basekey`` is printed as it is processed.

    Returns:
        dict mapping each ``basekey`` to its concatenated result.
    """
    def _features_against(basekey):
        # Single DataFrame of all features against one behaviour
        relative_weights = target_weights.div(target_weights[basekey], axis=0).dropna()
        return pd.concat(
            [feature_extractor(col.reset_index(), var) for var, col in relative_weights.iteritems()],
            keys=relative_weights.keys(),
            names=['var', 'metric'],
        )

    features_by_base = {}
    for basekey in target_weights.keys():  # Parallelisable
        print(basekey)
        features_by_base[basekey] = _features_against(basekey)
    return features_by_base


def dataframe_weight_filter(df, keys):
    """Return the rows of ``df`` whose index is zero on every level named in ``keys``.

    Args:
        df: frame (or series) with a multi-level index.
        keys: iterable of index level names (or positions) that must all equal
            ``0.0`` for a row to be kept.

    Returns:
        A new object holding only the matching rows.  When ``keys`` is empty
        there is nothing to filter on, so a copy of ``df`` is returned; a copy
        rather than ``df`` itself because callers modify the result in place.
    """
    # `reduce` is only a builtin on Python 2; functools provides it on both 2 and 3.
    from functools import reduce

    level_is_zero = [(df.index.get_level_values(k) == 0.0) for k in keys]
    if not level_is_zero:
        # reduce() over an empty sequence without an initial value raises TypeError.
        return df.copy()
    return df.loc[reduce(operator.and_, level_is_zero)]

# Index level names split into two groups, used below to select rows where one
# whole group is zero.  Presumably physical vs. communications metrics, nine in
# total to match n_metrics -- TODO confirm.
phys_keys = ['INDD','INHD','Speed']
comm_keys = ['ADelay','ARXP','ATXP','RXThroughput','TXThroughput','PLR']

In [None]:
# Comms Only Weights: keep the rows whose `phys_keys` index levels are all
# zero, then drop those levels from the index.
comms_target_weights = (
    dataframe_weight_filter(target_weights, phys_keys)
    .reset_index(level=phys_keys, drop=True)
)
comms_features_d = target_weight_feature_extractor(comms_target_weights)
# One bar chart per baseline key, titled with that key.
for var, feat in comms_features_d.items():
    feat.unstack().plot(kind='bar', title=var)

In [None]:
# Phys Only Weights: keep the rows whose `comm_keys` index levels are all
# zero, then drop those levels from the index.
phys_target_weights = (
    dataframe_weight_filter(target_weights, comm_keys)
    .reset_index(level=comm_keys, drop=True)
)
phys_features_d = target_weight_feature_extractor(phys_target_weights)
# One bar chart per baseline key, titled with that key.
for var, feat in phys_features_d.items():
    feat.unstack().plot(kind='bar', title=var)

In [None]:
import pandas as pd
from bounos.Analyses.Weight import summed_outliers_per_weight
observer = 'Bravo'
target = 'Alfa'
n_nodes = 6
n_metrics = 9

results_path = "/home/bolster/src/aietes/results/Malicious Behaviour Trust Comparison-2015-07-20-17-47-53"

# The backup store is only read here, so open it read-only; the context
# manager closes the file handle when the loop is done.
with pd.HDFStore(results_path + "/outliers.bkup.h5", mode='r') as store:
    # One entry per key in the store, each summarised by
    # `summed_outliers_per_weight` for the configured observer/target.
    target_weights_dict = {}
    for runkey in store.keys():
        # Progress output; single-argument call prints the same text on Python 2 and 3.
        print(runkey)
        target_weights_dict[runkey] = summed_outliers_per_weight(store.get(runkey), observer, n_metrics, target=target)

In [None]:
# Take the index level names from one of the per-run frames directly instead
# of relying on the `runkey` loop variable leaked by another cell.
# Assumes every run's frame shares the same index level names -- TODO confirm.
run_index_names = list(next(iter(target_weights_dict.values())).index.names)
joined_target_weights = pd.concat(target_weights_dict, names=['run'] + run_index_names)

# Drop the 'run' level and sort by index.  `sort_index()` is the explicit
# spelling of the argument-less `DataFrame.sort()` used previously.
sorted_joined_target_weights = joined_target_weights.reset_index('run', drop=True).sort_index()
joined_feats = target_weight_feature_extractor(sorted_joined_target_weights)


# Stack the per-baseline results into one frame, move 'metric' into the
# columns, and order the columns comms metrics first, then physical metrics.
alt_joined_feats = pd.concat(joined_feats, names=['base', 'comp', 'metric']).unstack('metric')[comm_keys + phys_keys]

In [None]:
# One bar chart per value of the 'base' index level, titled with that value.
for k,g in alt_joined_feats.groupby(level='base'):
    g.plot(kind='bar', title=k)

In [None]:
sorted_joined_target_weights.to_hdf('/dev/shm/target_weights_full.h5', key='target_weights_full')
# NOTE(review): this flushes '/dev/shm/target_weights.h5', which is NOT the
# 'target_weights_full.h5' file written on the line above -- confirm which
# file was intended.  The store is now opened in a context manager so the
# handle is closed after the flush instead of being left open.
with pd.HDFStore('/dev/shm/target_weights.h5') as store:
    store.flush()
# Assumes `matplotlib` is already in the kernel namespace; it is not imported
# in any cell shown here -- TODO confirm.
matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)
# Plot only the first entry of `joined_feats`, then show its key and table.
for var, feat in joined_feats.items():
    feat.unstack().plot(kind='bar', title=var)
    break
print(var)
feat.unstack()

In [None]:
# Per-run feature comparison: for every run, extract the features of each
# column against each baseline column and store them as one frame per run.
feat_comps = {}
# Each group still carries the 'run' index level, hence the explicit xs() to
# drop it.  The loop uses its own names so the notebook-level `target_weights`
# is not overwritten by running this cell.
for run, run_group in joined_target_weights.groupby(level='run'):
    run_weights = run_group.xs(run, level='run')
    # Same per-baseline extraction (including the progress printing) as
    # `target_weight_feature_extractor`, so call it instead of repeating it.
    known_good_features_d = target_weight_feature_extractor(run_weights)
    feat_comps[run] = pd.DataFrame.from_dict(known_good_features_d)

In [None]:
# Stack the per-run comparison frames; the dict keys become the outermost index level.
joined_feat_comp = pd.concat(feat_comps)
# Summary statistics per 'var' group after reshaping and sorting both axes.
(
    joined_feat_comp
    .unstack()
    .swaplevel(0, 1, axis=0)   # swap the two outermost row levels
    .swaplevel(0, 1, axis=1)   # swap the two outermost column levels
    .sortlevel(axis=0)
    .sortlevel(axis=1)
    .reindex()
    .groupby(level='var')
    .describe()
)

In [None]:
# Peek at the first run only: print its key and the head of its frame with the
# 'run' level removed (the group itself still carries that level, hence xs()).
for run, target_weights in joined_target_weights.groupby(level='run'):
    print(run)
    print(target_weights.xs(run, level='run').head())
    break

In [None]:
# Display the concatenated per-run feature comparison frame.
joined_feat_comp

In [None]:
# Called without arguments, so no setting is changed; the call just returns the
# current floating-point error-handling settings for inspection.
# Assumes `np` (numpy) is already in the kernel namespace -- it is not imported
# in any cell shown here.
np.seterr()

In [None]:
# Scratch check of element-wise division of ones by zeros (1/0 in every slot),
# presumably to see how the error settings above treat division by zero.
np.divide(np.ones(4),np.zeros(4))