In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from graphviz import Digraph
import scipy.spatial.distance
from scipy.cluster.hierarchy import dendrogram
#Clustering birch
from freediscovery.cluster import birch_hierarchy_wrapper
from freediscovery.cluster import Birch,BirchSubcluster
#Sklearn
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn import metrics
#Learners
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import OneClassSVM
#Distance measure
from scipy.spatial.distance import euclidean

import warnings

import matplotlib.pyplot as plt

import pickle

In [2]:
warnings.filterwarnings("ignore")

In [3]:
class bcluster(object):
    """One node of the cluster tree plus the per-node models and test bookkeeping.

    Every attribute starts empty (``None`` or ``[]``) and is filled in through the
    setter / adder methods below.
    """

    def __init__(self):
        # Position in the tree.
        self.parent = None
        self.parent_id = None
        self.depth = None
        self.size = None
        self.cluster_id = None
        # Training points that belong to this node.
        self.data_points = []
        # Bookkeeping for test samples routed to this node.
        self.test_points = []
        self.test_labels = []
        self.predicted = []
        self.centroid = None
        # Models attached to this node.
        self.classifier = None
        self.outlier_model = None
        self.cluster_obj = None
        self.outlier_points = []
        self.score = []
        # Distances used to derive the distance-based outlier threshold.
        self.d1 = None
        self.d2 = None
        self.threshold = None

    def set_parent(self,parent_node=None):
        """Link this node to ``parent_node``; with no argument (or ``None``) clear the link."""
        # Identity check: a parent object with a permissive __eq__ must not be
        # mistaken for "no parent".
        if parent_node is None:
            self.parent = None
            self.parent_id = None
        else:
            self.parent = parent_node
            self.parent_id = parent_node.cluster_id

    def set_depth(self,depth):
        """Store the node's depth in the tree."""
        self.depth = depth

    def set_size(self,size):
        """Store the cluster size."""
        self.size = size

    def set_cluster_id(self,cluster_id):
        """Store the node's cluster id."""
        self.cluster_id = cluster_id

    def set_data_points(self,data_points):
        """Replace the collection of training points held by this node."""
        self.data_points = data_points

    def set_test_labels(self,test_labels):
        """Replace the true labels of the test samples routed to this node."""
        self.test_labels = test_labels

    def add_test_points(self,test_point):
        """Record one test sample as routed to this node."""
        self.test_points.append(test_point)

    def add_predicted(self,predicted):
        """Record one prediction made by this node."""
        self.predicted.append(predicted)

    def set_centroid(self,centroid):
        """Store the node's centroid."""
        self.centroid = centroid

    def set_classifier(self,classifier):
        """Attach the classifier used for predictions at this node."""
        self.classifier = classifier

    def set_outlier_model(self,outlier_model):
        """Attach the model consulted by ``check_OCS_outlier``."""
        self.outlier_model = outlier_model

    def set_cluster_obj(self,cluster_obj):
        """Store a reference to an associated cluster object."""
        self.cluster_obj = cluster_obj

    def add_outlier_points(self,outlier_points):
        """Record one test sample flagged as an outlier at this node."""
        self.outlier_points.append(outlier_points)

    def reset_outlier_bucket(self):
        """Forget all outliers recorded so far."""
        self.outlier_points = []

    def set_score(self,score):
        """Store the evaluation score for this node."""
        self.score = score

    def add_d1(self,d1):
        """Store the d1 distance."""
        self.d1 = d1

    def add_d2(self,d2):
        """Store the d2 distance."""
        self.d2 = d2

    def calculate_threshold(self,outlier_threshold):
        """Set ``threshold`` to ``max(d1, d2) * outlier_threshold``.

        Both ``d1`` and ``d2`` must have been set beforehand.
        """
        self.threshold = max(self.d1,self.d2)*outlier_threshold

    def check_outlier(self,distance):
        """Return True when ``distance`` is strictly greater than the stored threshold."""
        return bool(self.threshold < distance)

    def check_OCS_outlier(self,test_data):
        """Return True when the outlier model predicts ``-1`` for the single sample ``test_data``."""
        prediction = self.outlier_model.predict([test_data])
        return bool(prediction[0] == -1)

In [4]:
class birch(object):
    """Wraps freediscovery's ``Birch`` clusterer and turns its hierarchy into ``bcluster`` nodes.

    Typical flow: ``fit`` -> ``get_cluster_tree`` -> ``model_adder`` and/or
    ``outlier_model_adder`` -> ``predict`` / ``predict_new`` -> ``certify_model``.
    """

    def __init__(self,threshold=0.7,branching_factor=40,n_clusters=None,outlier_threshold=0.7):
        """Configure the underlying clusterer.

        ``threshold``, ``branching_factor`` and ``n_clusters`` are forwarded to ``Birch``;
        ``outlier_threshold`` is the multiplier handed to every node's
        ``calculate_threshold``.
        """
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters
        self.outlier_threshold = outlier_threshold
        # Tree built by the last get_cluster_tree() call; default for predict()/predict_new().
        self.cluster_tree = None
        self.Birch_clusterer = Birch(threshold=self.threshold, branching_factor=self.branching_factor,
                                     n_clusters=self.n_clusters,compute_sample_indices=True)

    # Fitting the model with train_X
    def fit(self,data,y):
        """Fit the clusterer on ``data`` and keep ``data`` / ``y`` for later row lookups.

        Both are indexed positionally with ``.iloc`` elsewhere in this class, so they are
        expected to be a pandas DataFrame and Series.
        """
        self.data = data
        self.y = y
        self.Birch_clusterer.fit(self.data)

    #Defines and builds the Cluster Feature Tree
    def get_cluster_tree(self):
        """Build one ``bcluster`` node per sub-cluster of the fitted hierarchy.

        Returns ``(clusters, max_depth)`` where ``clusters`` maps the position of each
        sub-cluster in the flattened hierarchy to its node. The dictionary is also kept
        on ``self.cluster_tree``.
        """
        self.htree, n_clusters = birch_hierarchy_wrapper(self.Birch_clusterer)
        # flatten() does not depend on the loop variable, so compute it once.
        sub_clusters = self.htree.flatten()
        clusters = {}
        max_depth = 0
        for i in range(n_clusters):
            sub_cluster = sub_clusters[i]
            node = bcluster()
            node.set_cluster_id(sub_cluster['cluster_id'])
            depth = sub_cluster.current_depth
            node.set_depth(depth)
            max_depth = max(max_depth, depth)
            if depth == 0:
                node.set_parent()
            else:
                # NOTE(review): the dict is keyed by flatten() position but the parent is
                # looked up by its cluster_id; this assumes the two coincide and that
                # parents come before their children - confirm against freediscovery.
                node.set_parent(clusters[sub_cluster.parent['cluster_id']])
            node.set_size(sub_cluster['cluster_size'])
            # Positional row indices into self.data.
            data_points = sub_cluster['document_id_accumulated']
            node.set_data_points(data_points)
            centroid = self.data.iloc[data_points, :].mean(axis=0).values
            node.set_centroid(centroid)
            d1,d1_v = self.calculate_d1(centroid,data_points)
            d2 = self.calculate_d2(centroid,data_points,d1_v)
            node.add_d1(d1)
            node.add_d2(d2)
            node.calculate_threshold(self.outlier_threshold)
            clusters[i] = node
        self.cluster_tree = clusters
        return clusters,max_depth

    def _point_matrix(self,data_points):
        """Return the feature rows of ``self.data`` at the given positional indices as floats."""
        return np.asarray(self.data.iloc[list(data_points)].values, dtype='float64')

    #Calculate the d1 distance(point farthest away from centroid)
    def calculate_d1(self,centroid,data_points):
        """Find the member point farthest from ``centroid``.

        ``data_points`` are positional row indices into ``self.data``; the distance is
        measured on the feature rows, not on the indices themselves.

        Returns ``(d1, d1_v)``: the largest distance and the row index that attains it,
        or ``(0, None)`` when ``data_points`` is empty.
        """
        data_points = list(data_points)
        if not data_points:
            return 0, None
        rows = self._point_matrix(data_points)
        distances = np.linalg.norm(rows - np.asarray(centroid, dtype='float64'), axis=1)
        farthest = int(np.argmax(distances))  # first maximum wins on ties
        return float(distances[farthest]), data_points[farthest]

    #Calculate the d2 distance(point farthest away from d1 and its distance from centroid)
    def calculate_d2(self,centroid,data_points,d1_v):
        """Distance from ``centroid`` to the member point farthest from the d1 point.

        ``d1_v`` is the row index returned by ``calculate_d1``. Returns ``0.0`` when there
        is no d1 point or no data.
        """
        data_points = list(data_points)
        if d1_v is None or not data_points:
            return 0.0
        rows = self._point_matrix(data_points)
        anchor = np.asarray(self.data.iloc[d1_v].values, dtype='float64')
        from_anchor = np.linalg.norm(rows - anchor, axis=1)
        # Measure the selected point, not whichever point happened to be visited last.
        d2_row = rows[int(np.argmax(from_anchor))]
        return float(np.linalg.norm(d2_row - np.asarray(centroid, dtype='float64')))

    # Display's the tree
    def show_clutser_tree(self):
        """Display the hierarchy built by ``get_cluster_tree``."""
        self.htree.display_tree()

    # Correctly spelled alias; the original name is kept for existing callers.
    show_cluster_tree = show_clutser_tree

    # Add classification model at each node and leaf
    def model_adder(self,cluster_tree):
        """Fit an entropy decision tree on each node's training rows and attach it.

        Nodes are updated in place; the same dictionary is returned.
        """
        for cluster_id in cluster_tree:
            clf = DecisionTreeClassifier(criterion='entropy')
            sample_points = cluster_tree[cluster_id].data_points
            train_X_sub = self.data.iloc[sample_points,:]
            train_y_sub = self.y.iloc[sample_points]
            clf.fit(train_X_sub,train_y_sub)
            cluster_tree[cluster_id].set_classifier(clf)
        return cluster_tree

    def outlier_model_adder(self,cluster_tree):
        """Fit a one-class SVM on each node's training rows and attach it as the outlier model.

        Nodes are updated in place; the same dictionary is returned.
        """
        for cluster_id in cluster_tree:
            clf = OneClassSVM(kernel = 'poly',degree = 5,gamma = 'scale',nu=0.4)
            sample_points = cluster_tree[cluster_id].data_points
            train_X_sub = self.data.iloc[sample_points,:]
            clf.fit(train_X_sub)
            cluster_tree[cluster_id].set_outlier_model(clf)
        return cluster_tree

    def _resolve_tree(self,cluster_tree):
        """Pick the tree to use: explicit argument, then ``self.cluster_tree``, then legacy global."""
        if cluster_tree is not None:
            return cluster_tree
        tree = getattr(self, 'cluster_tree', None)
        if tree is None:
            # Legacy behaviour: earlier versions read a module-level ``cluster_tree``.
            tree = globals().get('cluster_tree')
        if tree is None:
            raise ValueError("no cluster tree available: call get_cluster_tree() or pass cluster_tree=")
        return tree

    @staticmethod
    def _level_centroids(cluster_tree,depth):
        """Return ``(cluster_ids, centroid_matrix)`` for the nodes whose depth equals ``depth``."""
        cluster_ids = [cid for cid in cluster_tree if cluster_tree[cid].depth == depth]
        if not cluster_ids:
            raise ValueError("no clusters found at depth %r" % (depth,))
        centroid_matrix = np.vstack([np.asarray(cluster_tree[cid].centroid, dtype='float64')
                                     for cid in cluster_ids])
        return cluster_ids, centroid_matrix

    @staticmethod
    def _nearest_cluster(cluster_ids,centroid_matrix,sample):
        """Return ``(cluster id, euclidean distance)`` of the centroid row closest to ``sample``."""
        gaps = centroid_matrix - np.asarray(sample, dtype='float64')
        dist = np.sqrt(np.sum(gaps**2, axis=1))
        nearest = int(np.argmin(dist))  # first minimum wins on ties
        return cluster_ids[nearest], dist[nearest]

    # Prediction Function with height based prediction with outlier detection
    def predict(self,test_X,depth,do_predict=True,cluster_tree=None):
        """Route each row of ``test_X`` to the nearest node at ``depth``.

        Each row's index label is appended to the chosen node's ``test_points`` and, when
        the node's distance threshold is exceeded, to its ``outlier_points``. With
        ``do_predict`` the node's classifier labels the row.

        ``cluster_tree`` defaults to the tree built by ``get_cluster_tree``.
        Returns the list of per-row predictions (empty when ``do_predict`` is False).
        Raises ``ValueError`` when no tree is available or no node sits at ``depth``.
        """
        cluster_tree = self._resolve_tree(cluster_tree)
        cluster_ids, centroid_matrix = self._level_centroids(cluster_tree, depth)
        predicted = []
        for row_label, row in test_X.iterrows():
            test_sample = row.values
            selected_cluster, min_distance = self._nearest_cluster(cluster_ids, centroid_matrix, test_sample)
            node = cluster_tree[selected_cluster]
            node.add_test_points(row_label)
            # Outlier identifier
            if node.check_outlier(min_distance):
                node.add_outlier_points(row_label)
            if do_predict:
                _predicted_label = node.classifier.predict([test_sample])
                node.add_predicted(_predicted_label)
                predicted.append(_predicted_label)
        return predicted

    def distance(self,x,y):
        """Return ``(cluster_id, distance)`` for the centroid in ``x`` nearest to ``y``.

        ``x`` is a 2-D array whose rows are ``[cluster_id, centroid]``; ``y`` is one sample.
        """
        centroid_matrix = np.vstack([np.asarray(c, dtype='float64') for c in x[:,1]])
        return self._nearest_cluster(list(x[:,0]), centroid_matrix, y)

    # New Predict
    def predict_new(self,test_X,depth,do_predict=True,cluster_tree=None):
        """Like ``predict`` but flags outliers with each node's one-class model.

        All per-run bookkeeping (``test_points``, ``predicted``, ``outlier_points``) is
        cleared on every node first, so re-running gives the same counts instead of
        accumulating test points against a freshly reset outlier bucket.

        ``cluster_tree`` defaults to the tree built by ``get_cluster_tree``.
        Returns the list of per-row predictions (empty when ``do_predict`` is False).
        Raises ``ValueError`` when no tree is available or no node sits at ``depth``.
        """
        cluster_tree = self._resolve_tree(cluster_tree)
        for cluster_id in cluster_tree:
            node = cluster_tree[cluster_id]
            node.reset_outlier_bucket()
            node.test_points = []
            node.predicted = []
        # Ids and centroids are kept in separate containers: packing [id, centroid] pairs
        # into one np.array is ragged and is rejected by recent NumPy releases.
        cluster_ids, centroid_matrix = self._level_centroids(cluster_tree, depth)
        predicted = []
        for row_label, row in test_X.iterrows():
            test_sample = np.array(row.values)
            selected_cluster, min_distance = self._nearest_cluster(cluster_ids, centroid_matrix, test_sample)
            node = cluster_tree[selected_cluster]
            node.add_test_points(row_label)
            # Outlier identifier (one-class model rather than the distance threshold)
            if node.check_OCS_outlier(test_sample):
                node.add_outlier_points(row_label)
            if do_predict:
                _predicted_label = node.classifier.predict([test_sample])
                node.add_predicted(_predicted_label)
                predicted.append(_predicted_label)
        return predicted

    # Model certification creator
    def certify_model(self,cluster_tree,test_y):
        """Store weighted precision / recall / F1 on every node that has predictions.

        ``test_y`` is looked up by the index labels recorded in each node's
        ``test_points``. Nodes without test points or without recorded predictions
        (e.g. after a ``do_predict=False`` run) are skipped.
        """
        for cluster_id in cluster_tree:
            node = cluster_tree[cluster_id]
            if len(node.test_points) == 0 or len(node.predicted) == 0:
                continue
            node.set_test_labels(test_y.loc[node.test_points].values)
            # Each stored prediction is a length-1 array; flatten to one label per sample.
            predicted_labels = np.ravel(node.predicted)
            precision = metrics.precision_score(node.test_labels, predicted_labels,average='weighted')
            recall = metrics.recall_score(node.test_labels, predicted_labels,average='weighted')
            f1_Score = metrics.f1_score(node.test_labels, predicted_labels,average='weighted')
            score = {'precision': precision,'recall': recall,'f1_Score': f1_Score}
            node.set_score(score)

In [5]:
def load_data(path,target,test_size=0.10,random_state=42):
    """Read a CSV and split it into train / test features and labels.

    path: CSV file to read.
    target: name of the label column; all other columns are converted with
        ``pd.to_numeric`` and used as features.
    test_size, random_state: forwarded to ``train_test_split``; the defaults reproduce
        the previously hard-coded 10% hold-out with seed 42.

    Prints the shape of the full frame and returns
    ``(train_X, test_X, train_y, test_y)``.
    """
    df = pd.read_csv(path)
    print(df.shape)
    y = df[target]
    X = df.drop(labels = target, axis = 1).apply(pd.to_numeric)
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=test_size,
                                                        random_state=random_state)
    return train_X, test_X, train_y, test_y

def load_mutated_data(path,target):
    """Read a CSV and return ``(features, labels)`` without any train/test split.

    ``target`` names the label column; the remaining columns are converted with
    ``pd.to_numeric``. The shape of the feature frame is printed.
    """
    frame = pd.read_csv(path)
    labels = frame[target]
    features = frame.drop(labels = target, axis = 1).apply(pd.to_numeric)
    print(features.shape)
    return features,labels

In [6]:
# Cluster Driver
def cluster_driver(file,print_tree = True,target='defects',branching_factor=20,add_classifiers=False):
    """Load a CSV, fit the BIRCH wrapper on its training split and build the cluster tree.

    file: path of the CSV handed to ``load_data``.
    print_tree: display the fitted hierarchy when True.
    target: label column name (default ``'defects'``, the previously hard-coded value).
    branching_factor: forwarded to ``birch`` (default 20, the previously hard-coded value).
    add_classifiers: also attach a per-node classifier via ``model_adder``. Off by
        default, matching the earlier behaviour where only outlier models were attached;
        turn it on before predicting with ``do_predict=True``.

    Returns ``(cluster, cluster_tree, max_depth, test_X)`` where ``test_X`` is the
    held-out feature split produced by ``load_data``.
    """
    train_X, test_X, train_y, test_y = load_data(file,target)
    cluster = birch(branching_factor=branching_factor)
    cluster.fit(train_X,train_y)
    cluster_tree,max_depth = cluster.get_cluster_tree()
    if add_classifiers:
        cluster_tree = cluster.model_adder(cluster_tree)
    cluster_tree = cluster.outlier_model_adder(cluster_tree)
    if print_tree:
        cluster.show_clutser_tree()
    return cluster,cluster_tree,max_depth,test_X

In [None]:
# Build the cluster tree from the training CSV (the path is relative to the working directory).
# cluster_driver returns (birch wrapper, cluster-tree dict, max depth, held-out test_X split);
# the held-out frame is stored here as test_X_1.
file = 'Data/NSL-KDD/modified/train.csv'
cluster,cluster_tree,max_depth,test_X_1 = cluster_driver(file)

(21731, 21)


In [None]:
# Saving Model
# NOTE: the file is written with pickle even though its name ends in ".h5".
# Only cluster_tree is saved here; the `cluster` wrapper object is not.
# Pickle files should only ever be loaded back from a trusted source.
with open('Data/NSL-KDD/modified/birch_model.h5', 'wb') as config_dictionary_file:
    pickle.dump(cluster_tree, config_dictionary_file)

In [None]:
# Loading Test Data
# load_mutated_data performs no train/test split: every row of the CSV is returned,
# with 'defects' as the label column. Note that `file` is rebound to the test path here.
file = 'Data/NSL-KDD/modified/test.csv'
test_X,test_y = load_mutated_data(file,'defects')

In [None]:
# Route the test rows to the clusters at the chosen depth (mention depth).
# max_depth is printed as a reminder of the deepest level available.
# The third argument is do_predict=False: rows are only assigned to clusters and checked
# for outliers, so `predicted` comes back as an empty list.
print(max_depth)
depth = 0
predicted = cluster.predict_new(test_X,depth,False)

In [None]:
# Shape of the test feature frame returned by load_mutated_data.
test_X.shape

In [None]:
# Print the stored score of every cluster that received at least one test point.
# (A score is only filled in by certify_model; until then it is the empty default.)
for i, node in cluster_tree.items():
    if node.test_points:
        print(node.score)

In [None]:
# Check for outliers detected.
# Per cluster with test points: training size, number of test points and the
# fraction of those test points flagged as outliers; then the number of such
# clusters and the overall outlier count.
total = 0
j = 0
for i, node in cluster_tree.items():
    n_test = len(node.test_points)
    if n_test == 0:
        continue
    n_outliers = len(node.outlier_points)
    print("Percentage Identified",len(node.data_points),n_test,n_outliers/n_test)
    j += 1
    total += n_outliers
print(j,total)