# Required imports and packages

In [None]:
import pandas as pd
import numpy as np
import pickle
import warnings

#Scipy
import scipy.spatial.distance
from scipy.cluster.hierarchy import dendrogram

#Clustering birch
from freediscovery.cluster import birch_hierarchy_wrapper
from freediscovery.cluster import Birch,BirchSubcluster

#Sklearn
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn import metrics

#Learners
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import OneClassSVM

#Distance measure
from scipy.spatial.distance import euclidean
from sklearn.preprocessing import MinMaxScaler

#Plots
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

# Birch Cluster Objects
## This object is created for each cluster in the CF Tree. This stores all information regarding a cluster - 
```
1) The Original Data points
2) New Test Points
3) Predicted labels
4) Original Test labels (when available)
5) Anomaly Detector object
6) Classifier Object
7) Current Performance Score
```

In [None]:
class bcluster(object):
    """Record holding everything known about one cluster of the CF tree.

    Stores the cluster's position in the hierarchy (parent, depth, id), the
    indices of its training points, the test points routed to it per test
    set, the predictions and true labels for those points, the classifier
    and outlier model attached to it, and its latest performance score.
    """

    def __init__(self):
        # Position in the hierarchy
        self.parent = None
        self.parent_id = None
        self.depth = None
        self.size = None
        self.cluster_id = None
        # Training-point identifiers belonging to this cluster
        self.data_points = []
        # Per-test-set bookkeeping, keyed by test-set number
        self.test_points = {}
        self.test_labels = {}
        self.predicted = {}
        self.centroid = None
        # Models attached to this cluster
        self.classifier = None
        self.outlier_model = None
        self.cluster_obj = None
        self.outlier_points = []
        # Latest performance score; replaced wholesale by set_score()
        self.score = []
        # Distance statistics used to derive the outlier threshold
        self.d1 = None
        self.d2 = None
        self.threshold = None
        # Test-set numbers at which the model was last retrained / certified
        self.last_retrained = 1
        self.last_certified = 0

    def set_parent(self, parent_node=None):
        """Attach this cluster to ``parent_node``; ``None`` marks a root."""
        # Identity test: a parent object with a custom __eq__ must not be
        # mistaken for "no parent".
        if parent_node is None:
            self.parent = None
            self.parent_id = None
        else:
            self.parent = parent_node
            self.parent_id = parent_node.cluster_id

    def set_depth(self, depth):
        """Record the depth of this cluster in the tree."""
        self.depth = depth

    def retrained(self, test_set):
        """Record the test-set number at which the classifier was retrained."""
        self.last_retrained = test_set

    def certify(self, test_set):
        """Record the test-set number at which the model was last certified."""
        self.last_certified = test_set

    def set_size(self, size):
        """Record the cluster size."""
        self.size = size

    def set_cluster_id(self, cluster_id):
        """Record the cluster identifier."""
        self.cluster_id = cluster_id

    def set_data_points(self, data_points):
        """Replace the stored training-point identifiers."""
        self.data_points = data_points

    def set_test_labels(self, test_labels, test_set):
        """Store (overwriting) the true labels for ``test_set``."""
        self.test_labels[test_set] = test_labels

    def add_test_points(self, test_point, test_set):
        """Append one test-point identifier to the bucket of ``test_set``."""
        self.test_points.setdefault(test_set, []).append(test_point)

    def add_predicted(self, predicted, test_set):
        """Append one predicted label to the bucket of ``test_set``."""
        self.predicted.setdefault(test_set, []).append(predicted)

    def set_centroid(self, centroid):
        """Record the cluster centroid."""
        self.centroid = centroid

    def set_classifier(self, classifier):
        """Attach the classification model."""
        self.classifier = classifier

    def set_outlier_model(self, outlier_model):
        """Attach the outlier-detection model."""
        self.outlier_model = outlier_model

    def set_cluster_obj(self, cluster_obj):
        """Attach the underlying cluster object."""
        self.cluster_obj = cluster_obj

    def add_outlier_points(self, outlier_points):
        """Append one entry to the outlier bucket."""
        self.outlier_points.append(outlier_points)

    def reset_outlier_bucket(self):
        """Empty the outlier bucket."""
        self.outlier_points = []

    def set_score(self, score):
        """Replace the stored performance score."""
        self.score = score

    def get_score(self):
        """Return the stored performance score."""
        return self.score

    def add_d1(self, d1):
        """Record the d1 distance."""
        self.d1 = d1

    def add_d2(self, d2):
        """Record the d2 distance."""
        self.d2 = d2

    def calculate_threshold(self, outlier_threshold):
        """Set ``threshold`` to ``max(d1, d2) * outlier_threshold``."""
        self.threshold = max(self.d1, self.d2) * outlier_threshold

    def check_outlier(self, distance):
        """Return True when ``distance`` exceeds the stored threshold."""
        return bool(self.threshold < distance)

    def check_OCS_outlier(self, test_data):
        """Return True when the outlier model predicts -1 for ``test_data``."""
        return bool(self.outlier_model.predict([test_data]) == -1)

# Birch CF Tree Object
## This object holds all the functionality for testing anomalies, updating clusters, verifying integrity, and recreating the whole CF Tree.
## The Functionalities are - 
```
1) fit - Fit the Data into the Birch algorithm to create the clusters
2) set_test - Store the test data for future uses
3) get_cluster_tree - For each cluster at every level creates the bcluster objects
4) model_adder - Classification model added to each cluster by this function (Change this function to add different model)
5) update_model - Classification model is updated with new data
6) outlier_model_adder - Outlier detection model is added to each cluster (Change this function to add different model)
7) certify_model - Scores are calculated in this function

```

In [None]:
class birch(object):
    """BIRCH hierarchy with one classifier and one outlier model per cluster.

    Wraps a ``Birch`` clusterer, turns its hierarchy into a dict of
    ``bcluster`` nodes, attaches a decision tree and a one-class SVM to each
    node, routes test samples to the nearest centroid at a chosen depth, and
    scores / retrains the per-cluster classifiers.
    """

    def __init__(self,threshold=0.7,branching_factor=40,n_clusters=None,outlier_threshold=0.7):
        """Create the underlying ``Birch`` clusterer and empty test-set stores.

        ``threshold``, ``branching_factor`` and ``n_clusters`` are forwarded
        to ``Birch``; ``outlier_threshold`` scales ``max(d1, d2)`` when each
        cluster's distance threshold is computed.
        """
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters
        self.outlier_threshold = outlier_threshold
        self.Birch_clusterer = Birch(threshold=self.threshold, branching_factor=self.branching_factor,
                                     n_clusters=self.n_clusters,compute_sample_indices=True)
        # Number of test sets stored so far; also the key of the latest one
        self.test_set = 0
        self.test_set_X = {}
        self.test_set_y = {}

    # Fit the initial data into the hierarchical cluster
    def fit(self,data,y):
        """Store ``data`` / ``y`` and fit the clusterer on ``data``."""
        self.data = data
        print(self.data.shape)
        self.y = y
        self.Birch_clusterer.fit(self.data)

    # Store the test data
    def set_test(self,data,y):
        """Store a new test set under the next test-set number."""
        self.test_set += 1
        self.test_set_X[self.test_set] = data
        self.test_set_y[self.test_set] = y

    # Defines and builds the Cluster Feature Tree
    def get_cluster_tree(self):
        """Build one ``bcluster`` per hierarchy node.

        Returns:
            (clusters, max_depth) where ``clusters`` maps the node's position
            in the flattened hierarchy to its ``bcluster`` and ``max_depth``
            is the largest depth seen.
        """
        self.htree, n_clusters = birch_hierarchy_wrapper(self.Birch_clusterer)
        # Loop-invariant: flatten once instead of once per node
        # (assumes flatten() returns the same ordering on every call).
        flat_nodes = self.htree.flatten()
        clusters = {}
        max_depth = 0
        for i in range(n_clusters):
            sub_cluster = flat_nodes[i]
            node = bcluster()
            node.set_cluster_id(sub_cluster['cluster_id'])
            depth = sub_cluster.current_depth
            node.set_depth(depth)
            max_depth = max(max_depth, depth)
            if depth == 0:
                node.set_parent()
            else:
                # Parents are looked up by cluster_id although the dict is
                # keyed by position -- presumably the two coincide; verify.
                node.set_parent(clusters[sub_cluster.parent['cluster_id']])
            node.set_size(sub_cluster['cluster_size'])
            data_points = sub_cluster['document_id_accumulated']
            node.set_data_points(data_points)
            member_rows = self.data.iloc[data_points, :]
            centroid = member_rows.mean(axis=0).values
            node.set_centroid(centroid)
            # d1/d2 must be measured against the member feature vectors,
            # not against their positional indices.
            member_vectors = member_rows.values
            d1, d1_v = self.calculate_d1(centroid, member_vectors)
            d2 = self.calculate_d2(centroid, member_vectors, d1_v)
            node.add_d1(d1)
            node.add_d2(d2)
            node.calculate_threshold(self.outlier_threshold)
            clusters[i] = node
        return clusters, max_depth

    # Calculate the d1 distance(point farthest away from centroid)
    def calculate_d1(self,centroid,data_points):
        """Return ``(d1, d1_v)``: the largest centroid distance and its point.

        ``data_points`` is an iterable of feature vectors. ``d1_v`` is None
        (and ``d1`` is 0) when no point lies away from the centroid.
        """
        d1 = 0
        d1_v = None
        for point in data_points:
            distance = euclidean(centroid, point)
            if distance > d1:
                d1 = distance
                d1_v = point
        return d1, d1_v

    # Calculate the d2 distance(point farthest away from d1 and its distance from centroid)
    def calculate_d2(self,centroid,data_points,d1_v):
        """Return the centroid distance of the point farthest from ``d1_v``.

        ``data_points`` is an iterable of feature vectors. Returns 0.0 when
        ``d1_v`` is None (no spread), and the centroid distance of ``d1_v``
        itself when no point differs from ``d1_v``.
        """
        if d1_v is None:
            return 0.0
        farthest = 0
        d2_v = None
        for point in data_points:
            distance = euclidean(d1_v, point)
            if distance > farthest:
                farthest = distance
                d2_v = point
        if d2_v is None:
            d2_v = d1_v
        # Use the selected farthest point, not whichever point came last.
        return euclidean(centroid, d2_v)

    # Display's the tree
    def show_clutser_tree(self):
        """Display the hierarchy built by ``get_cluster_tree``."""
        self.htree.display_tree()

    # Add classification model at each node and leaf
    def model_adder(self,cluster_tree):
        """Fit an entropy decision tree on each cluster's training points."""
        for cluster_id in cluster_tree:
            node = cluster_tree[cluster_id]
            clf = DecisionTreeClassifier(criterion='entropy')
            sample_points = node.data_points
            clf.fit(self.data.iloc[sample_points, :], self.y.iloc[sample_points])
            node.set_classifier(clf)
        return cluster_tree

    # Update a classifictaion model when required
    def update_model(self,cluster_tree,cluster_id):
        """Refit one cluster's classifier on its training and test points.

        The training set is the cluster's original points plus every test
        point routed to it in any stored test set.
        """
        node = cluster_tree[cluster_id]
        sample_points = node.data_points
        frames_X = [self.data.iloc[sample_points, :]]
        frames_y = [self.y.iloc[sample_points]]
        for test_set, sample_test_points in node.test_points.items():
            frames_X.append(self.test_set_X[test_set].iloc[sample_test_points, :])
            frames_y.append(self.test_set_y[test_set].iloc[sample_test_points])
        clf = DecisionTreeClassifier(criterion='entropy')
        clf.fit(pd.concat(frames_X), pd.concat(frames_y))
        node.retrained(self.test_set)
        node.set_classifier(clf)

    # Add a outlier detection model
    def outlier_model_adder(self,cluster_tree):
        """Fit a one-class SVM on each cluster's training points."""
        for cluster_id in cluster_tree:
            node = cluster_tree[cluster_id]
            clf = OneClassSVM(kernel = 'poly',degree = 5,gamma = 'scale',nu=0.4)
            clf.fit(self.data.iloc[node.data_points, :])
            node.set_outlier_model(clf)
        return cluster_tree

    # Distance Measure
    def distance(self,x,y):
        """Return ``(cluster_id, distance)`` of the centroid nearest to ``y``.

        ``x`` is a 2-column object array whose rows are
        ``[cluster_id, centroid_vector]``; ``y`` is one sample vector.
        """
        centroids = np.vstack(list(x[:, 1])).astype(float)
        diffs = centroids - np.asarray(y, dtype=float)
        dists = np.sqrt(np.sum(diffs ** 2, axis=1))
        nearest = int(np.argmin(dists))
        return x[nearest, 0], dists[nearest]

    # Prediction Function with height based prediction with outlier detection
    def predict(self,test_X,depth,cluster_tree,do_predict=True):
        """Route every row of ``test_X`` to its nearest cluster at ``depth``.

        Each row's index label is recorded on the selected cluster for the
        current test set and, if the cluster's outlier model flags it, in
        the cluster's outlier bucket. All outlier buckets are reset first.
        With ``do_predict`` the cluster's classifier labels the row.

        Returns:
            The list of predicted labels (empty when ``do_predict`` is False).

        Raises:
            ValueError: if rows must be routed but no cluster has ``depth``.
        """
        predicted = []
        candidates = []
        for cluster_id in cluster_tree:
            node = cluster_tree[cluster_id]
            node.reset_outlier_bucket()
            if node.depth == depth:
                candidates.append((cluster_id, node.centroid))
        if not candidates:
            if len(test_X) == 0:
                return predicted
            raise ValueError("no clusters at depth %s" % depth)
        # Fill an object array explicitly: np.array() on ragged
        # [id, vector] pairs is rejected by recent NumPy releases.
        cluster_centroids = np.empty((len(candidates), 2), dtype=object)
        for row, (cluster_id, centroid) in enumerate(candidates):
            cluster_centroids[row, 0] = cluster_id
            cluster_centroids[row, 1] = centroid
        for row_label, row_values in test_X.iterrows():
            test_sample = np.array(row_values.values)
            selected_cluster, min_distance = self.distance(cluster_centroids, test_sample)
            target = cluster_tree[selected_cluster]
            target.add_test_points(row_label, self.test_set)
            if target.check_OCS_outlier(test_sample):
                target.add_outlier_points(row_label)
            if do_predict:
                _predicted_label = target.classifier.predict([test_sample])
                target.add_predicted(_predicted_label[0], self.test_set)
                predicted.append(_predicted_label[0])
        return predicted

    # Model certification creator
    def certify_model(self,cluster_tree,test_y):
        """Score every cluster that received points from the current test set.

        Stores the true labels on the cluster and sets its score to a dict
        with weighted ``precision``, ``recall`` and ``f1_Score``.
        """
        for cluster_id in cluster_tree:
            node = cluster_tree[cluster_id]
            if self.test_set not in node.test_points:
                continue
            node.set_test_labels(test_y[node.test_points[self.test_set]].values, self.test_set)
            truth = node.test_labels[self.test_set]
            guess = node.predicted[self.test_set]
            precision = metrics.precision_score(truth, guess, average='weighted')
            recall = metrics.recall_score(truth, guess, average='weighted')
            f1_Score = metrics.f1_score(truth, guess, average='weighted')
            score = {'precision': precision,'recall': recall,'f1_Score': f1_Score}
            node.set_score(score)

    # Check each model for performance validation with the pre selected threshold
    def check_model(self,cluster_tree,threshold = 0.7):
        """Retrain every cluster whose stored F1 score is below ``threshold``.

        Clusters that never received test points, or that have no score yet,
        are skipped.
        """
        score = {}
        for cluster_id in cluster_tree:
            node = cluster_tree[cluster_id]
            if len(node.test_points) == 0:
                continue
            score[cluster_id] = node.get_score()
            if not score[cluster_id]:
                # Not certified yet: nothing to compare against
                continue
            if score[cluster_id]['f1_Score'] < threshold:
                print('retreining',score[cluster_id]['f1_Score'])
                self.update_model(cluster_tree,cluster_id)

    # Rebuilding all models if required
    def rebuild_models(self,cluster):
        """Refit on training data plus all stored test sets and rebuild models.

        The stored test sets are cleared afterwards.

        NOTE(review): the refit runs on ``self`` while the tree and models
        are rebuilt through ``cluster``; this is only coherent when
        ``cluster`` is ``self`` -- confirm callers always pass the same object.

        Returns:
            (cluster_tree, max_depth) for the rebuilt hierarchy.
        """
        stored = range(1, self.test_set + 1)
        X = pd.concat([self.data] + [self.test_set_X[k] for k in stored]).reset_index(drop=True)
        y = pd.concat([self.y] + [self.test_set_y[k] for k in stored]).reset_index(drop=True)
        self.fit(X,y)
        self.test_set = 0
        self.test_set_X = {}
        self.test_set_y = {}
        cluster_tree,max_depth = cluster.get_cluster_tree()
        print(cluster)
        cluster_tree = cluster.model_adder(cluster_tree)
        cluster_tree = cluster.outlier_model_adder(cluster_tree)
        return cluster_tree,max_depth
                

# Some Utility Function
## The functions are - 

```
1) load_data - Data is divided into independent (X) and dependent (y) columns based on the "target" value provided
2) get_data - Divide the data into independent (X) and dependent (y) columns and relabel them as normal (1) and anomaly (-1)
3) cluster_driver - Base function to create the hierarchical model. It returns all the clusters and cluster tree objects

```

In [None]:
def load_data(path,target):
    """Read a CSV and split it into features and the ``target`` column.

    Returns:
        (frame, features, labels): the full frame as read, the frame without
        ``target`` with every column passed through ``pd.to_numeric``, and
        the ``target`` column.
    """
    frame = pd.read_csv(path)
    labels = frame[target]
    features = frame.drop(labels=target, axis=1).apply(pd.to_numeric)
    return frame, features, labels

def get_data(file_path,target='defects',normal_class='normal'):
    """Load a CSV and relabel its target column as 1 (normal) / -1 (anomaly).

    Args:
        file_path: CSV location, forwarded to ``load_data``.
        target: name of the label column.
        normal_class: label value mapped to 1; every other value maps to -1.

    Returns:
        (frame, features, labels): the full frame, the feature frame, and a
        new ``pd.Series`` of 1 / -1 labels with a fresh default index.
    """
    train_df, train_X, train_y = load_data(file_path,target)
    # The label column is addressed only through ``target``; no column name
    # is hard-coded, so a target other than 'defects' works too.
    y_train = pd.Series([1 if label == normal_class else -1 for label in train_y.values])
    return train_df,train_X,y_train

# Cluster Driver
def cluster_driver(file,print_tree = False):
    """Build the hierarchical model from the CSV at ``file``.

    Loads the data, fits a ``birch`` model with a branching factor of 20,
    builds the cluster tree, and attaches a classifier and then an outlier
    model to every cluster. Optionally displays the tree.

    Returns:
        (model, cluster_tree, max_depth)
    """
    _, features, labels = get_data(file)
    model = birch(branching_factor=20)
    model.fit(features, labels)
    tree, max_depth = model.get_cluster_tree()
    # Classifiers first, then outlier detectors
    tree = model.outlier_model_adder(model.model_adder(tree))
    if print_tree:
        model.show_clutser_tree()
    return model, tree, max_depth

# Setting up the experiment
## Here we have divided the dataset into 2 groups of 5 subsets, each subset containing different types of anomalies -
```
1) Normal_Data - Contains normal class along with one type of known anomaly.
2) 5_anomaly - Contains a new anomaly type in each of the first 5 subsets; the same anomalies then repeat.
3) Mixed_data - Data mixed with normal class and newly seen anomalies for further use.
```

In [None]:
# Experimental Setups
# Runs 10 rounds. Round 0 builds the tree; rounds 0-4 also accumulate the
# normal + anomaly data seen so far into a mixed CSV; every round predicts
# the anomaly file at each depth and certifies the models; rounds 5-9
# additionally retrain any cluster whose F1 score is below 0.8.
total_df = pd.DataFrame()
for i in range(10):
    predicted = {}
    # Load Normal File
    file_normal = 'Data/NSL-KDD/modified/Train/Normal_Data/N' + str(i) + '.csv'
    # Load Anomaly File
    file_anomaly = 'Data/NSL-KDD/modified/Train/5_anomaly/A' + str(i) + '.csv'
    # Load Mixed File
    file_mixed = 'Data/NSL-KDD/modified/Train/Mixed_data/M' + str(i) + '.csv'
    # Build the whole tree for the 1st time with normal class and known anomaly
    if i == 0:
        cluster,cluster_tree,max_depth = cluster_driver(file_normal)
    # Sending out new anomalies to be detected without any rebuilding
    if i < 5:
        normal_df,_,_ = load_data(file_normal,'defects')
        anomaly_df,_,_ = load_data(file_anomaly,'defects')
        total_df = pd.concat([total_df,normal_df,anomaly_df])
        # NOTE(review): written under 'mixed_Data' while file_mixed points at
        # 'Mixed_data' -- these differ on case-sensitive filesystems; confirm.
        total_df.to_csv('Data/NSL-KDD/modified/Train/mixed_Data/M' + str(i+5) + '.csv',index=False)
    # Rebuilding models and with past data and trying to find the same anomalies
    test_df,test_X,test_y = get_data(file_anomaly)
    cluster.set_test(test_X,test_y)
    for depth in range(max_depth):
        predicted[depth] = cluster.predict(test_X,depth,cluster_tree,True)
    cluster.certify_model(cluster_tree,test_y)
    if i >= 5:
        cluster.check_model(cluster_tree,threshold=0.8)
    # NOTE(review): reports depth 1 only; raises KeyError when max_depth < 2.
    print(metrics.classification_report(test_y, predicted[1]))
    # Count clusters that received test points and their flagged outliers.
    # A comprehension keeps the outer loop index ``i`` from being shadowed.
    clusters_hit = [node for node in cluster_tree.values() if len(node.test_points) != 0]
    j = len(clusters_hit)
    total = sum(len(node.outlier_points) for node in clusters_hit)
    print(j,total)


# Rebuilding the Whole Tree if required

In [None]:
# Refit the clusterer on the training data plus every stored test set, then
# rebuild the cluster tree with fresh classifiers and outlier models.
cluster_tree,max_depth = cluster.rebuild_models(cluster)