In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from graphviz import Digraph
import scipy.spatial.distance
from scipy.cluster.hierarchy import dendrogram
#Clustering birch
from freediscovery.cluster import birch_hierarchy_wrapper
from freediscovery.cluster import Birch,BirchSubcluster
#Sklearn
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn import metrics
#Learners
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import OneClassSVM
#Distance measure
from scipy.spatial.distance import euclidean

import warnings

import matplotlib.pyplot as plt

In [32]:
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the libraries used below.
warnings.filterwarnings("ignore")

In [76]:
class bcluster(object):
    """One node of the cluster tree, plus the bookkeeping attached to it.

    A node stores its position in the tree (parent, depth, id), the training
    row positions assigned to it, an optional classifier and outlier model,
    and the test rows / predictions / outliers routed to it at predict time.
    """

    def __init__(self):
        # Tree position
        self.parent = None
        self.parent_id = None
        self.depth = None
        self.size = None
        self.cluster_id = None
        # Training rows assigned to this node
        self.data_points = []
        # Test-time bookkeeping, appended to by the predict routines
        self.test_points = []
        self.test_labels = []
        self.predicted = []
        self.centroid = None
        # Per-node models
        self.classifier = None
        self.outlier_model = None
        self.cluster_obj = None
        self.outlier_points = []
        self.score = []
        # Distance statistics used to derive the outlier threshold
        self.d1 = None
        self.d2 = None
        self.threshold = None

    def set_parent(self,parent_node=None):
        """Attach ``parent_node`` as parent; calling without it marks a root.

        Uses an identity test so that a parent object with a custom
        ``__eq__`` is never mistaken for "no parent".
        """
        if parent_node is None:
            self.parent = None
            self.parent_id = None
        else:
            self.parent = parent_node
            self.parent_id = parent_node.cluster_id

    def set_depth(self,depth):
        """Store the depth of this node in the tree."""
        self.depth = depth

    def set_size(self,size):
        """Store the cluster size reported for this node."""
        self.size = size

    def set_cluster_id(self,cluster_id):
        """Store the identifier of this node."""
        self.cluster_id = cluster_id

    def set_data_points(self,data_points):
        """Replace the collection of training rows assigned to this node."""
        self.data_points = data_points

    def set_test_labels(self,test_labels):
        """Replace the true labels of the test rows routed to this node."""
        self.test_labels = test_labels

    def add_test_points(self,test_point):
        """Record one test row as routed to this node."""
        self.test_points.append(test_point)

    def add_predicted(self,predicted):
        """Record one prediction made by this node's classifier."""
        self.predicted.append(predicted)

    def set_centroid(self,centroid):
        """Store the centroid vector of this node."""
        self.centroid = centroid

    def set_classifier(self,classifier):
        """Attach the classifier used for test rows routed to this node."""
        self.classifier = classifier

    def set_outlier_model(self,outlier_model):
        """Attach the model used by ``check_OCS_outlier``."""
        self.outlier_model = outlier_model

    def set_cluster_obj(self,cluster_obj):
        """Store a reference to the underlying cluster object."""
        self.cluster_obj = cluster_obj

    def add_outlier_points(self,outlier_points):
        """Record one test row flagged as an outlier for this node."""
        self.outlier_points.append(outlier_points)

    def set_score(self,score):
        """Store the evaluation scores computed for this node."""
        self.score = score

    def add_d1(self,d1):
        """Store the d1 distance statistic."""
        self.d1 = d1

    def add_d2(self,d2):
        """Store the d2 distance statistic."""
        self.d2 = d2

    def calculate_threshold(self,outlier_threshold):
        """Set ``threshold`` to ``max(d1, d2) * outlier_threshold``.

        ``add_d1`` and ``add_d2`` must have been called first.
        """
        self.threshold = max(self.d1,self.d2)*outlier_threshold

    def check_outlier(self,distance):
        """Return True when ``distance`` is strictly greater than the threshold."""
        # bool() so NumPy scalar distances still yield a plain Python bool.
        return bool(self.threshold < distance)

    def check_OCS_outlier(self,test_data):
        """Return True when the outlier model labels ``test_data`` as -1."""
        # predict() is given a single sample, so inspect its single result.
        return bool(self.outlier_model.predict([test_data])[0] == -1)

In [95]:
class birch(object):
    """BIRCH cluster tree with per-cluster models and outlier detection.

    Wraps a ``Birch`` clusterer (created with ``compute_sample_indices=True``),
    turns its hierarchy into a dict of ``bcluster`` nodes and routes test rows
    to the nearest centroid at a chosen depth.
    """

    def __init__(self,threshold=0.7,branching_factor=40,n_clusters=None,outlier_threshold=0.7):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters
        self.outlier_threshold = outlier_threshold
        # Filled in by get_cluster_tree(); the predict routines fall back to it.
        self.cluster_tree = None
        self.Birch_clusterer = Birch(threshold=self.threshold, branching_factor=self.branching_factor,
                                     n_clusters=self.n_clusters,compute_sample_indices=True)

    # Fitting the model with train_X
    def fit(self,data,y):
        """Keep references to the training frame/labels and fit the clusterer on ``data``."""
        self.data = data
        self.y = y
        self.Birch_clusterer.fit(self.data)

    # Defines and builds the Cluster Feature Tree
    def get_cluster_tree(self):
        """Build one ``bcluster`` node per cluster of the fitted hierarchy.

        Returns:
            (clusters, max_depth): ``clusters`` maps the position of each node in
            the flattened hierarchy to its ``bcluster``; ``max_depth`` is the
            largest depth seen. The dict is also stored on ``self.cluster_tree``.
        """
        self.htree, n_clusters = birch_hierarchy_wrapper(self.Birch_clusterer)
        # Flatten once instead of once per iteration.
        flat_nodes = self.htree.flatten()
        clusters = {}
        max_depth = 0
        for i in range(n_clusters):
            node = bcluster()
            sub_cluster = flat_nodes[i]
            node.set_cluster_id(sub_cluster['cluster_id'])
            depth = sub_cluster.current_depth
            node.set_depth(depth)
            max_depth = max(max_depth, depth)
            if depth == 0:
                node.set_parent()
            else:
                # NOTE(review): parents are looked up by cluster_id while nodes are
                # stored by flattened position; this assumes the two coincide.
                node.set_parent(clusters[sub_cluster.parent['cluster_id']])
            node.set_size(sub_cluster['cluster_size'])
            data_points = sub_cluster['document_id_accumulated']
            node.set_data_points(data_points)
            centroid = self.data.iloc[data_points, :].mean(axis=0).values
            node.set_centroid(centroid)
            d1,d1_v = self.calculate_d1(centroid,data_points)
            d2 = self.calculate_d2(centroid,data_points,d1_v)
            node.add_d1(d1)
            node.add_d2(d2)
            node.calculate_threshold(self.outlier_threshold)
            clusters[i] = node
        self.cluster_tree = clusters
        return clusters,max_depth

    def _points_matrix(self,data_points):
        """Return the training rows at positions ``data_points`` as a float matrix."""
        return np.asarray(self.data.iloc[list(data_points), :].values, dtype='float64')

    # Calculate the d1 distance (point farthest away from centroid)
    def calculate_d1(self,centroid,data_points):
        """Find the training row of the cluster farthest from ``centroid``.

        ``data_points`` are row positions into ``self.data``; the rows' feature
        vectors (not the positions themselves) are compared with the centroid.

        Returns:
            (d1, d1_v): the largest Euclidean distance and the feature vector
            that attains it, or ``(0, None)`` when ``data_points`` is empty.
        """
        points = self._points_matrix(data_points)
        if points.shape[0] == 0:
            return 0,None
        distances = np.linalg.norm(points - np.asarray(centroid,dtype='float64'), axis=1)
        farthest = int(np.argmax(distances))  # first maximum wins on ties
        return float(distances[farthest]),points[farthest]

    # Calculate the d2 distance (point farthest away from d1 and its distance from centroid)
    def calculate_d2(self,centroid,data_points,d1_v):
        """Distance from ``centroid`` to the cluster row farthest from ``d1_v``.

        ``d1_v`` is the feature vector returned by ``calculate_d1``. Returns 0
        when there is no such vector or the cluster has no rows.
        """
        points = self._points_matrix(data_points)
        if d1_v is None or points.shape[0] == 0:
            return 0
        from_d1 = np.linalg.norm(points - np.asarray(d1_v,dtype='float64'), axis=1)
        d2_v = points[int(np.argmax(from_d1))]
        # Measure the selected point, not whichever point was visited last.
        return float(np.linalg.norm(d2_v - np.asarray(centroid,dtype='float64')))

    # Displays the tree
    def show_clutser_tree(self):
        """Print the hierarchy built by ``get_cluster_tree`` (name kept for existing callers)."""
        self.htree.display_tree()

    # Correctly spelled alias of show_clutser_tree
    show_cluster_tree = show_clutser_tree

    # Add classification model at each node and leaf
    def model_adder(self,cluster_tree):
        """Fit an entropy decision tree on each node's training rows and attach it."""
        for cluster_id in cluster_tree:
            clf = DecisionTreeClassifier(criterion='entropy')
            sample_points = cluster_tree[cluster_id].data_points
            train_X_sub = self.data.iloc[sample_points,:]
            train_y_sub = self.y.iloc[sample_points]
            clf.fit(train_X_sub,train_y_sub)
            cluster_tree[cluster_id].set_classifier(clf)
        return cluster_tree

    def outlier_model_adder(self,cluster_tree):
        """Fit a ``OneClassSVM(nu=0.1)`` on each node's training rows and attach it."""
        for cluster_id in cluster_tree:
            clf = OneClassSVM(nu=0.1)
            sample_points = cluster_tree[cluster_id].data_points
            train_X_sub = self.data.iloc[sample_points,:]
            clf.fit(train_X_sub)
            cluster_tree[cluster_id].set_outlier_model(clf)
        return cluster_tree

    def _resolve_tree(self,cluster_tree):
        """Return the explicit tree if given, else the one stored by ``get_cluster_tree``."""
        tree = cluster_tree if cluster_tree is not None else getattr(self,'cluster_tree',None)
        if tree is None:
            raise ValueError('No cluster tree available: call get_cluster_tree() first '
                             'or pass cluster_tree explicitly.')
        return tree

    def _route_and_predict(self,test_X,depth,do_predict,cluster_tree,is_outlier):
        """Send each test row to the nearest centroid at ``depth`` and record the result.

        ``is_outlier(node, sample, distance)`` decides whether the row is also
        recorded as an outlier of the selected node.
        """
        tree = self._resolve_tree(cluster_tree)
        ids = [cluster_id for cluster_id in tree if tree[cluster_id].depth == depth]
        if not ids:
            raise ValueError('No clusters found at depth %r' % (depth,))
        centroids = np.asarray([tree[cluster_id].centroid for cluster_id in ids], dtype='float64')
        predicted = []
        for row_label,row in test_X.iterrows():
            test_sample = row.values
            sample = np.asarray(test_sample,dtype='float64')
            distances = np.linalg.norm(centroids - sample, axis=1)
            nearest = int(np.argmin(distances))  # first minimum wins on ties
            node = tree[ids[nearest]]
            node.add_test_points(row_label)
            if is_outlier(node,sample,distances[nearest]):
                node.add_outlier_points(row_label)
            if do_predict:
                _predicted_label = node.classifier.predict([test_sample])
                node.add_predicted(_predicted_label)
                predicted.append(_predicted_label)
        return predicted

    # Prediction Function with height based prediction with outlier detection
    def predict(self,test_X,depth,do_predict=True,cluster_tree=None):
        """Route test rows to clusters at ``depth``; flag outliers by distance threshold.

        Args:
            test_X: DataFrame of test rows; its index labels are what gets recorded.
            depth: tree depth whose clusters compete for each row.
            do_predict: when True, also classify each row with the node's classifier.
            cluster_tree: tree to use; defaults to the one built by ``get_cluster_tree``.

        Returns:
            List with one classifier output per row (empty when ``do_predict`` is False).

        Raises:
            ValueError: if no tree is available or no cluster exists at ``depth``.
        """
        return self._route_and_predict(
            test_X,depth,do_predict,cluster_tree,
            lambda node,sample,dist: node.check_outlier(dist))

    def distance(self,x,y):
        """Return ``(cluster_id, distance)`` of the centroid in ``x`` nearest to ``y``.

        ``x`` is a sequence (or 2-column object array) of ``[cluster_id, centroid]``
        pairs; ``y`` is the sample vector.
        """
        ids = [pair[0] for pair in x]
        centroids = np.asarray([pair[1] for pair in x], dtype='float64')
        if centroids.shape[0] == 0:
            raise ValueError('distance() needs at least one centroid')
        dist = np.sqrt(np.sum((centroids - np.asarray(y,dtype='float64'))**2, axis=1))
        nearest = int(np.argmin(dist))
        return ids[nearest],dist[nearest]

    # New Predict
    def predict_new(self,test_X,depth,do_predict=True,cluster_tree=None):
        """Like ``predict`` but flags outliers with each node's one-class outlier model.

        Arguments, return value and errors are the same as for ``predict``.
        """
        return self._route_and_predict(
            test_X,depth,do_predict,cluster_tree,
            lambda node,sample,dist: node.check_OCS_outlier(sample))

    # Model certification creator
    def certify_model(self,cluster_tree,test_y):
        """Score every node that received test rows and store the scores on it.

        For each such node the true labels are taken from ``test_y`` using the
        recorded test row labels, and weighted precision, recall and F1 are
        computed against the node's stored predictions.
        """
        for cluster_id in cluster_tree:
            node = cluster_tree[cluster_id]
            if len(node.test_points) == 0:
                continue
            node.set_test_labels(test_y[node.test_points].values)
            precision = metrics.precision_score(node.test_labels,
                                                node.predicted,average='weighted')
            recall = metrics.recall_score(node.test_labels,
                                          node.predicted,average='weighted')
            f1_Score = metrics.f1_score(node.test_labels,
                                        node.predicted,average='weighted')
            score = {'precision': precision,'recall': recall,'f1_Score': f1_Score}
            node.set_score(score)

In [116]:
def load_data(path,target,test_size=0.10,random_state=42):
    """Read a CSV, split features from the target column and make a train/test split.

    Args:
        path: location of the CSV file.
        target: name of the label column.
        test_size: fraction (or count) of rows held out, passed to ``train_test_split``.
        random_state: seed passed to ``train_test_split`` so the split is repeatable.

    Returns:
        train_X, test_X, train_y, test_y

    The shape of the raw frame is printed as a quick sanity check.
    """
    df = pd.read_csv(path)
    print(df.shape)
    y = df[target]
    X = df.drop(labels = target, axis = 1)
    # Every feature column must be numeric; a non-numeric value raises here.
    X = X.apply(pd.to_numeric)
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=test_size,
                                                        random_state=random_state)
    return train_X, test_X, train_y, test_y

def load_mutated_data(path,target):
    """Read a CSV and return ``(X, y)`` without splitting.

    ``y`` is the ``target`` column; ``X`` is every other column converted
    with ``pd.to_numeric``.
    """
    frame = pd.read_csv(path)
    labels = frame[target]
    features = frame.drop(labels = target, axis = 1).apply(pd.to_numeric)
    return features,labels

In [97]:
# Cluster Driver
def cluster_driver(file,print_tree = True,target='defects',branching_factor=20,add_classifiers=False):
    """Train the clusterer on ``file`` and build its cluster tree.

    Args:
        file: path of the training CSV.
        print_tree: when True, print the resulting hierarchy.
        target: name of the label column in the CSV.
        branching_factor: branching factor passed to ``birch``.
        add_classifiers: when True, also fit a classifier on every node
            (needed before calling ``predict`` with ``do_predict=True``).

    Returns:
        cluster, cluster_tree, max_depth, test_X -- the fitted ``birch`` object,
        its node dict (with outlier models attached), the maximum tree depth and
        the held-out feature rows from the train/test split.
    """
    train_X, test_X, train_y, test_y = load_data(file,target)
    cluster = birch(branching_factor=branching_factor)
    cluster.fit(train_X,train_y)
    cluster_tree,max_depth = cluster.get_cluster_tree()
    if add_classifiers:
        cluster_tree = cluster.model_adder(cluster_tree)
    cluster_tree = cluster.outlier_model_adder(cluster_tree)
    if print_tree:
        cluster.show_clutser_tree()
    return cluster,cluster_tree,max_depth,test_X

In [98]:
# getting the cluster tree
# Trains on the KDD training CSV; cluster_driver also prints the hierarchy by default.
file = 'Data/KDD/modified/train.csv'

# test_X_1 is the held-out feature split that cluster_driver returns as its fourth value.
cluster,cluster_tree,max_depth,test_X_1 = cluster_driver(file)

(97278, 11)
[cluster_id=0] N_children: 11 N_samples: 9727
> [cluster_id=1] N_children: 8 N_samples: 123
> > [cluster_id=2] N_children: 0 N_samples: 13
> > [cluster_id=3] N_children: 0 N_samples: 18
> > [cluster_id=4] N_children: 0 N_samples: 20
> > [cluster_id=5] N_children: 0 N_samples: 19
> > [cluster_id=6] N_children: 0 N_samples: 16
> > [cluster_id=7] N_children: 0 N_samples: 14
> > [cluster_id=8] N_children: 0 N_samples: 18
> > [cluster_id=9] N_children: 0 N_samples: 5
> [cluster_id=10] N_children: 0 N_samples: 13
> [cluster_id=11] N_children: 3 N_samples: 652
> > [cluster_id=12] N_children: 15 N_samples: 201
> > > [cluster_id=13] N_children: 0 N_samples: 15
> > > [cluster_id=14] N_children: 0 N_samples: 7
> > > [cluster_id=15] N_children: 0 N_samples: 18
> > > [cluster_id=16] N_children: 0 N_samples: 10
> > > [cluster_id=17] N_children: 0 N_samples: 13
> > > [cluster_id=18] N_children: 0 N_samples: 20
> > > [cluster_id=19] N_children: 0 N_samples: 13
> > > [cluster_id=20] N_child

In [74]:
# Base performance score
#file = 'Data/KDD/modified/train.csv'
#clf = DecisionTreeClassifier(criterion='gini')
#train_X, test_X, train_y, test_y = load_data(file,'defects')
#clf.fit(train_X, train_y)
#predicted = clf.predict(test_X)
#print(metrics.classification_report(test_y, predicted))

In [122]:
# Load the separate test CSV in full (no train/test split); 'defects' is the label column.
file = 'Data/KDD/modified/test.csv'
test_X,test_y = load_mutated_data(file,'defects')
#train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.10, random_state=42)

In [99]:
# Birch classifier score(mention depth)
# Route the held-out rows to the clusters at the deepest level of the tree.
print(max_depth)
depth = max_depth
# do_predict=False: rows are only assigned to clusters and checked for outliers,
# so `predicted` comes back empty and no classifier is required.
predicted = cluster.predict_new(test_X_1,depth,False)
#print(metrics.classification_report(test_y, predicted,False))
#cluster.certify_model(cluster_tree,test_y)

3


In [94]:
# (rows, columns) of the held-out feature frame returned by cluster_driver.
test_X_1.shape

(87551, 10)

In [None]:
# Print the stored score of every cluster that received at least one test row
for i in cluster_tree:
    node = cluster_tree[i]
    if len(node.test_points) > 0:
        print(node.score)

In [100]:
# Per-cluster outlier summary: training rows, routed test rows and the
# fraction of routed rows flagged as outliers; then overall totals.
total = 0
j = 0
for i in cluster_tree:
    node = cluster_tree[i]
    n_routed = len(node.test_points)
    if n_routed == 0:
        continue
    n_flagged = len(node.outlier_points)
    print("Percentage Identified",len(node.data_points),n_routed,n_flagged/n_routed)
    j += 1
    total += n_flagged
print(j,total)

Percentage Identified 15 125 1.0
Percentage Identified 7 74 1.0
Percentage Identified 18 89 1.0
Percentage Identified 10 64 1.0
Percentage Identified 13 53 1.0
Percentage Identified 20 92 1.0
Percentage Identified 13 101 1.0
Percentage Identified 12 63 1.0
Percentage Identified 13 171 1.0
Percentage Identified 9 21 1.0
Percentage Identified 7 146 1.0
Percentage Identified 16 90 1.0
Percentage Identified 14 118 1.0
Percentage Identified 18 174 1.0
Percentage Identified 16 152 1.0
Percentage Identified 15 145 0.993103448275862
Percentage Identified 13 74 1.0
Percentage Identified 16 90 1.0
Percentage Identified 17 217 1.0
Percentage Identified 12 95 1.0
Percentage Identified 17 94 1.0
Percentage Identified 19 161 0.9937888198757764
Percentage Identified 19 176 1.0
Percentage Identified 14 102 0.9901960784313726
Percentage Identified 16 145 1.0
Percentage Identified 9 94 1.0
Percentage Identified 20 140 1.0
Percentage Identified 3 153 1.0
Percentage Identified 17 141 1.0
Percentage Identi

Percentage Identified 12 78 1.0
Percentage Identified 155 1198 0.2020033388981636
Percentage Identified 4 21 1.0
Percentage Identified 20 162 0.49382716049382713
Percentage Identified 14 137 0.5912408759124088
Percentage Identified 112 1214 0.2701812191103789
Percentage Identified 3 97 1.0
Percentage Identified 9 49 0.9183673469387755
Percentage Identified 85 697 0.20516499282639886
Percentage Identified 14 144 0.6736111111111112
Percentage Identified 70 759 0.10935441370223979
Percentage Identified 41 346 0.4190751445086705
Percentage Identified 1 39 1.0
Percentage Identified 17 134 0.8880597014925373
Percentage Identified 1 67 1.0
Percentage Identified 18 103 0.5631067961165048
Percentage Identified 11 98 0.5816326530612245
Percentage Identified 5 25 1.0
Percentage Identified 3 75 1.0
Percentage Identified 21 267 0.3595505617977528
Percentage Identified 19 112 1.0
Percentage Identified 19 79 1.0
Percentage Identified 11 22 1.0
Percentage Identified 17 125 1.0
Percentage Identified 16

In [None]:
# Train and test on multiple datasets with mutation
num_outliers = []
s_file = 'data/JDT.csv'
for i in range(20):
    # Rebuild the tree on every repeat: nodes accumulate test/outlier rows across
    # predict calls, so a fresh tree keeps each repeat's counts separate.
    # cluster_driver returns four values; the held-out split is not needed here.
    cluster,cluster_tree,max_depth,_ = cluster_driver(s_file,print_tree=False)
    # predict() classifies by default and certify_model() scores those predictions,
    # so every node needs a fitted classifier first.
    cluster_tree = cluster.model_adder(cluster_tree)
    t_file = 'data/JDT_' + str(i) +'.csv'
    test_X,test_y = load_mutated_data(t_file,'defects')
    depth = max_depth
    predicted = cluster.predict(test_X,depth)
    cluster.certify_model(cluster_tree,test_y)
    # Fraction of this mutated dataset flagged as outliers, summed over all clusters.
    total = 0
    for node_key in cluster_tree:
        if len(cluster_tree[node_key].test_points) == 0:
            continue
        total += len(cluster_tree[node_key].outlier_points)
    num_outliers.append(total/test_X.shape[0])

In [None]:
# Plot outliers detected at every iteration
# num_outliers holds one value per repeat (outliers / test rows), so the repeat
# index belongs on the x-axis and the outlier fraction on the y-axis.
plt.plot(num_outliers)
plt.title('Context Shift outlier detection plot for:' + s_file)
plt.xlabel('Repeats')
plt.ylabel('Fraction of Outliers')
plt.show()

In [None]:
# Toy input for distance(): a 4x2 object array of [cluster_id, centroid] rows.
# The array is filled element by element because np.array() on ragged nested
# input (an int next to an ndarray) is rejected by recent NumPy releases.
x = np.empty((4,2), dtype=object)
for row,(cluster_id,centroid) in enumerate([(1,[18,2,3,4]),(2,[2,3,4,5]),(3,[3,4,5,6]),(4,[4,5,6,7])]):
    x[row,0] = cluster_id
    x[row,1] = np.array(centroid)
y = np.array([1,1,1,1])

In [None]:
def distance(x,y):
    """Return ``(row_index, distance)`` of the row of ``x`` closest to ``y``.

    ``x`` is a 2-column array whose second column holds the candidate vectors;
    the Euclidean distance of each candidate to ``y`` is printed before the
    position of the smallest one and that distance are returned.
    """
    offsets = list(x[:,1]) - y
    dist = np.sqrt(np.sum(offsets**2, axis=1))
    print(dist)
    flat_pos = np.argmin(dist, axis=None)
    ind = np.unravel_index(flat_pos, dist.shape)
    return ind[0],dist[flat_pos]

In [None]:
# Larger random input for distance(): 4500 rows of [id-like value, 118-d vector].
x = np.random.rand(4500,118)
z = np.random.rand(4500,1)
# Pair each z row with its x row in an explicit object array; np.array() on the
# zipped pairs is ragged (length-1 next to length-118) and recent NumPy rejects it.
pairs = np.empty((len(x),2), dtype=object)
for row in range(len(x)):
    pairs[row,0] = z[row]
    pairs[row,1] = x[row]
x = pairs
y = np.random.rand(1,118)

In [None]:
# Index of the row of x nearest to y, and that distance (distance() also prints all distances).
ind,dis = distance(x,y)

In [None]:
# Show the row of x selected by distance().
x[ind]

In [None]:
# Show the smallest distance returned by distance().
dis

In [None]:
# Display the whole input array.
# NOTE(review): this dumps every row; consider slicing before sharing the notebook.
x

In [138]:
# Stand-alone one-class SVM for comparison; nu=0.8 here versus nu=0.1 in birch.outlier_model_adder.
model =  OneClassSVM(nu=0.8)

In [139]:
# Re-split the training CSV and fit the one-class model on the training features only.
# This rebinds the global test_X/test_y that an earlier cell loaded from the test CSV.
file = 'Data/KDD/modified/train.csv'
train_X, test_X, train_y, test_y = load_data(file,'defects')
model.fit(train_X)

(247010, 11)


OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='auto_deprecated',
      kernel='rbf', max_iter=-1, nu=0.8, random_state=None, shrinking=True,
      tol=0.001, verbose=False)

In [140]:
# Label every held-out row with the fitted one-class model.
predicted = model.predict(test_X)

In [141]:
# Count how many rows received each predicted label.
# NOTE(review): only collections is used here; numpy is already imported as np in the first cell.
import collections, numpy
a = collections.Counter(predicted)

In [142]:
# Display the label counts; -1 is the label that bcluster.check_OCS_outlier treats as an outlier.
a

Counter({-1: 24701})