In [1]:
import pandas as pd
import numpy as np
import collections
import seaborn as sns
import matplotlib.pyplot as plt
from graphviz import Digraph
import scipy.spatial.distance
from scipy.cluster.hierarchy import dendrogram
#Clustering birch
from freediscovery.cluster import birch_hierarchy_wrapper
from freediscovery.cluster import Birch,BirchSubcluster
#Sklearn
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn import metrics
#Learners
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import OneClassSVM
#Distance measure
from scipy.spatial.distance import euclidean
from scipy import linalg, sparse

import warnings

import matplotlib.pyplot as plt

import pickle

In [2]:
warnings.filterwarnings("ignore")

In [3]:
def row_norms(X, squared=False):
    if sparse.issparse(X):
        if not isinstance(X, sparse.csr_matrix):
            X = sparse.csr_matrix(X)
        norms = csr_row_norms(X)
    else:
        norms = np.einsum('ij,ij->i', X, X)

    if not squared:
        np.sqrt(norms, norms)
    return norms

In [67]:
class bcluster(object):
    
    def __init__(self):
        self.parent = None
        self.parent_id = None
        self.depth = None
        self.size = None
        self.cluster_id = None
        self.data_points = []
        self.data_labels = []
        self.test_points = []
        self.test_labels = []
        self.predicted = []
        self.centroid = None
        self.classifier = None
        self.outlier_model = None
        self.cluster_obj = None
        self.outlier_points = []
        self.outlier_points_prediction = []
        self.score = []
        self.d1 = None
        self.d2 = None
        self.threshold = None
        self.retrain = False
    
    def set_parent(self,parent_node=None):
        if parent_node == None:
            self.parent = None
            self.parent_id = None
        else:
            self.parent = parent_node
            self.parent_id = parent_node.cluster_id
    
    def set_depth(self,depth):
        self.depth = depth
    
    def set_size(self,size):
        self.size = size
        
    def set_cluster_id(self,cluster_id):
        self.cluster_id = cluster_id
        
    def set_data_points(self,data_points):
        self.data_points = data_points
        
    def set_data_labels(self,data_labels):
        self.data_labels = data_labels
    
    def set_test_labels(self,test_labels):
        self.test_labels = test_labels
        
    def add_test_lables(self,test_labels):
        self.test_labels.append(test_labels)
        
    def reset_test_lables_bucket(self):
        self.test_labels = []
        
    def add_test_points(self,test_point):
        self.test_points.append(test_point)
        
    def reset_test_points_bucket(self):
        self.test_points = []
        
    def add_outlier_points_prediction(self,outlier_points_prediction):
        self.outlier_points_prediction.append(outlier_points_prediction)
        
    def reset_outlier_points_prediction_bucket(self):
        self.outlier_points_prediction = []
    
    def add_predicted(self,predicted):
        self.predicted.append(predicted)
    
    def set_centroid(self,centroid):
        self.centroid = centroid
        
    def set_classifier(self,classifier):
        self.classifier = classifier
        
    def set_outlier_model(self,outlier_model):
        self.outlier_model = outlier_model
        
    def set_cluster_obj(self,cluster_obj):
        self.cluster_obj = cluster_obj
        
    def add_outlier_points(self,outlier_points):
        self.outlier_points.append(outlier_points)
    
    def reset_outlier_bucket(self):
        self.outlier_points = []
        
    def set_score(self,score):
        self.score = score
        
    def add_d1(self,d1):
        self.d1 = d1
        
    def add_d2(self,d2):
        self.d2 = d2
        
    def calculate_threshold(self,outlier_threshold):
        self.threshold = max(self.d1,self.d2)*outlier_threshold
        
    def check_outlier(self,distance):
        if self.threshold < distance:
            result = True
        else:
            result = False
        return result
    
    def check_OCS_outlier(self,test_data):
        _result = self.outlier_model.predict([test_data])
        if _result == -1:
            result = True
            self.add_outlier_points_prediction(list(_result)[0])
        else:
            result = False
            self.add_outlier_points_prediction(list(_result)[0])
        return result
    
    def set_retarin():
        self.retrain = True

In [84]:
class birch(object):

    def __init__(self,threshold=0.5,branching_factor=40,n_clusters=None,outlier_threshold=0.7):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters
        self.outlier_threshold = outlier_threshold
        self.Birch_clusterer = Birch(threshold=self.threshold, branching_factor=self.branching_factor,
                                     n_clusters=self.n_clusters,compute_sample_indices=True)
    # Fitting the model with train_X
    def fit(self,data,y):
        self.data = data
        self.y = y
        #self.data.drop(self.data.columns[len(self.data.columns)-1], axis=1, inplace=True)
        self.Birch_clusterer.fit(self.data)

    #Defines and builds the Cluster Feature Tree
    def get_cluster_tree(self):
        self.htree, n_clusters = birch_hierarchy_wrapper(self.Birch_clusterer)
        clusters = {}
        max_depth = 0
        for i in range(n_clusters):
            #print('cluster:', i)
            node = bcluster()
            sub_cluster = self.htree.flatten()[i]
            node.set_cluster_id(sub_cluster['cluster_id'])
            depth = sub_cluster.current_depth
            node.set_depth(depth)
            if depth > max_depth:
                max_depth = depth
            if i not in clusters.keys():
                clusters[i] = {}
            if sub_cluster.current_depth == 0:
                node.set_parent()
            else:
                node.set_parent(clusters[sub_cluster.parent['cluster_id']])
            cluster_size = sub_cluster['cluster_size']
            node.set_size(cluster_size)
            data_points = sub_cluster['document_id_accumulated']
            node.set_data_points(data_points)
            node.set_data_labels(self.y[sub_cluster['document_id_accumulated']])
            centroid = self.data.iloc[sub_cluster['document_id_accumulated'], :].mean(axis=0).values
            node.set_centroid(centroid)
            d1,d1_v = self.calculate_d1(centroid,data_points)
            d2 = self.calculate_d2(centroid,data_points,d1_v)
            node.add_d1(d1)
            node.add_d2(d2)
            node.calculate_threshold(self.outlier_threshold)
            clusters[i] = node
        return clusters,max_depth
    
    #Calculate the d1 distance(point farthest away from centroid)
    def calculate_d1(self,centroid,data_points):
        d1 = 0
        u = centroid
        d1_v = None
        for point in data_points:
            v = point
            distance = euclidean(u,v)
            if distance>d1:
                d1 = distance
                d1_v = v
        return d1,d1_v
    
    #Calculate the d2 distance(point farthest away from d1 and its distance from centroid)
    def calculate_d2(self,centroid,data_points,d1_v):
        d2_d1 = 0
        u = d1_v
        d2_v = None
        for point in data_points:
            v = point
            distance = euclidean(u,v)
            if distance>d2_d1:
                d2_d1 = distance
                d2_v = v
        d2 = euclidean(centroid,v)
        return d2
    
    # Display's the tree
    def show_clutser_tree(self):
        self.htree.display_tree()
        
    # Add classification model at each node and leaf
    def model_adder(self,cluster_tree):
        for cluster_id in cluster_tree:
            clf = DecisionTreeClassifier(criterion='entropy')
            sample_points = cluster_tree[cluster_id].data_points
            train_X_sub = self.data.iloc[sample_points,:]
            train_y_sub = self.y[sample_points]
            clf.fit(train_X_sub,train_y_sub)
            cluster_tree[cluster_id].set_classifier(clf)
        return cluster_tree
    
    def outlier_model_adder(self,cluster_tree,depth):
        for cluster_id in cluster_tree:
            #if depth != cluster_tree[cluster_id].depth:
            #    continue
            #print('building outlier',cluster_id,cluster_tree[cluster_id].depth,len(cluster_tree[cluster_id].data_points))
            _nu = ((cluster_tree[cluster_id].data_labels == -1).sum()/len(cluster_tree[cluster_id].data_labels))
            if _nu == 0:
                _nu = 0.01
            clf = OneClassSVM(kernel = 'sigmoid',gamma = 'scale',nu=0.4)
            sample_points = cluster_tree[cluster_id].data_points
            train_X_sub = self.data.iloc[sample_points,:]
            clf.fit(train_X_sub)
            cluster_tree[cluster_id].set_outlier_model(clf)
        return cluster_tree
        
    # Prediction Function with height based prediction with outlier detection
    def predict(self,test_X,depth,do_predict=True):
        predicted = []
        for test_instance in test_X.iterrows():
            test_sample = test_instance[1].values
            min_distance = float('inf')
            selected_cluster = None
            for cluster_id in cluster_tree:
                if cluster_tree[cluster_id].depth != depth:
                    continue
                u = cluster_tree[cluster_id].centroid
                v = np.asarray(test_sample,dtype='float64')
                distance = euclidean(u,v)
                if distance < min_distance:
                    min_distance = distance
                    selected_cluster = cluster_id
            cluster_tree[selected_cluster].add_test_points(test_instance[0])
            # Outlier identifier
            if cluster_tree[selected_cluster].check_outlier(min_distance):
                cluster_tree[selected_cluster].add_outlier_points(test_instance[0])
            if do_predict:
                _predicted_label = cluster_tree[selected_cluster].classifier.predict([test_sample])
                cluster_tree[selected_cluster].add_predicted(_predicted_label)
                predicted.append(_predicted_label)
        return predicted
    
    def distance(self,x,y):
        dist = (list(x[:,1]) - y)**2
        dist = np.sum(dist, axis=1)
        dist = np.sqrt(dist)
        ind = np.unravel_index(np.argmin(dist, axis=None), dist.shape)
        min_distance = dist[np.argmin(dist, axis=None)]
        return list(x[ind])[0],min_distance
    
    def distance_alternet(self,x,y,row_norm):
        reduced_distance = np.dot(np.matrix(y), np.matrix(x).T)
        reduced_distance *= -2
        reduced_distance += row_norm
        return np.array(np.argmin(reduced_distance, axis=1))[0][0]
    
    # New Predict
    def predict_new(self,test_X,test_y,depth,do_predict=True):
        predicted = []
        cluster_centroids = []
        cluster_centroids_ids = []
        for cluster_id in cluster_tree:
            if cluster_tree[cluster_id].depth != depth:
                continue
            cluster_tree[cluster_id].reset_outlier_bucket()
            cluster_tree[cluster_id].reset_test_lables_bucket()
            cluster_tree[cluster_id].reset_test_points_bucket()
            cluster_tree[cluster_id].reset_outlier_points_prediction_bucket()
            if cluster_tree[cluster_id].depth != depth:
                continue
            cluster_centroids_ids.append(cluster_id) # Change when using normal distance
            cluster_centroids.append(cluster_tree[cluster_id].centroid)
        cluster_centroids = np.array(cluster_centroids)
        row_norm = row_norms(cluster_centroids)
        for test_instance in test_X.iterrows():
            test_sample = np.array(test_instance[1].values)
            #selected_cluster,min_distance = self.distance_alternet(cluster_centroids,test_sample,row_norm)  # Change when using normal distance
            selected_cluster = cluster_centroids_ids[self.distance_alternet(cluster_centroids,test_sample,row_norm)]
            if cluster_tree[selected_cluster].check_OCS_outlier(test_sample):
                cluster_tree[selected_cluster].add_outlier_points(test_instance[0])
            cluster_tree[selected_cluster].add_test_points(test_instance[0])
            cluster_tree[selected_cluster].add_test_lables(test_y[test_instance[0]])
            if do_predict:
                _predicted_label = cluster_tree[selected_cluster].classifier.predict([test_sample])
                cluster_tree[selected_cluster].add_predicted(_predicted_label)
                predicted.append(_predicted_label)
        return predicted
    
    # Model certification creator
    def certify_model(self,cluster_tree,test_y):
        for cluster_id in cluster_tree:
            if len(cluster_tree[cluster_id].test_points) == 0:
                continue
            cluster_tree[cluster_id].set_test_labels(test_y[cluster_tree[cluster_id].test_points].values)
            precision = metrics.precision_score(cluster_tree[cluster_id].test_labels, 
                                                cluster_tree[cluster_id].predicted,average='weighted')
            recall = metrics.recall_score(cluster_tree[cluster_id].test_labels, 
                                          cluster_tree[cluster_id].predicted,average='weighted')
            f1_Score = metrics.f1_score(cluster_tree[cluster_id].test_labels, 
                                        cluster_tree[cluster_id].predicted,average='weighted')
            score = {'precision': precision,'recall': recall,'f1_Score': f1_Score}
            cluster_tree[cluster_id].set_score(score)
            
    # Model certification creator
    def recertify_model(self,cluster_tree):
        for cluster_id in cluster_tree:
            num_test_points = len(cluster_tree[cluster_id].test_points)
            num_outlier_points = len(cluster_tree[cluster_id].outlier_points)
            if num_outlier_points/num_test_points > 0.6:
                cluster_tree[cluster_id].set_retarin()
                
    def rebuild_models(self,cluster_tree):
        for cluster_id in cluster_tree:
            clf = DecisionTreeClassifier(criterion='entropy')
            sample_points = cluster_tree[cluster_id].data_points + cluster_tree[cluster_id].outlier_points  
            train_X_sub = self.data.iloc[sample_points,:]
            train_y_sub = self.y[sample_points]
            clf.fit(train_X_sub,train_y_sub)
            cluster_tree[cluster_id].set_classifier(clf)
        return cluster_tree

In [52]:
def load_data(path,target):
    df = pd.read_csv(path)
    print(df.shape)
    y = df[target]
    X = df.drop(labels = target, axis = 1)
    X = X.apply(pd.to_numeric)
    y_test = []
    for instance in y.values:
        if instance == 'normal':
            y_test.append(1)
        else:
            y_test.append(-1)
    y_test = np.array(y_test)
    return X,y_test

def load_mutated_data(path,target):
    df = pd.read_csv(path)
    y = df[target]
    X = df.drop(labels = target, axis = 1)
    X = X.apply(pd.to_numeric)
    y_test = []
    for instance in y.values:
        if instance == 'normal':
            y_test.append(1)
        else:
            y_test.append(-1)
    y_test = np.array(y_test)
    return X,y_test

In [53]:
# Cluster Driver
def cluster_driver(file,print_tree = True):
    train_X, train_y = load_data(file,'defects')
    cluster = birch(branching_factor=20)
    cluster.fit(train_X,train_y)
    cluster_tree,max_depth = cluster.get_cluster_tree()
    cluster_tree = cluster.model_adder(cluster_tree)
    cluster_tree = cluster.outlier_model_adder(cluster_tree,max_depth)
    if print_tree:
        cluster.show_clutser_tree()
    return cluster,cluster_tree,max_depth

In [65]:
# getting the cluster tree
file = 'Data/NSL-KDD/modified/Train/train.csv'
cluster,cluster_tree,max_depth = cluster_driver(file)

(18143, 120)
[cluster_id=0] N_children: 2 N_samples: 18143
> [cluster_id=1] N_children: 5 N_samples: 29
> > [cluster_id=2] N_children: 0 N_samples: 4
> > [cluster_id=3] N_children: 0 N_samples: 2
> > [cluster_id=4] N_children: 0 N_samples: 1
> > [cluster_id=5] N_children: 0 N_samples: 8
> > [cluster_id=6] N_children: 0 N_samples: 14
> [cluster_id=7] N_children: 20 N_samples: 18114
> > [cluster_id=8] N_children: 5 N_samples: 63
> > > [cluster_id=9] N_children: 0 N_samples: 7
> > > [cluster_id=10] N_children: 0 N_samples: 14
> > > [cluster_id=11] N_children: 0 N_samples: 17
> > > [cluster_id=12] N_children: 0 N_samples: 15
> > > [cluster_id=13] N_children: 0 N_samples: 10
> > [cluster_id=14] N_children: 2 N_samples: 261
> > > [cluster_id=15] N_children: 20 N_samples: 234
> > > > [cluster_id=16] N_children: 0 N_samples: 20
> > > > [cluster_id=17] N_children: 0 N_samples: 2
> > > > [cluster_id=18] N_children: 0 N_samples: 10
> > > > [cluster_id=19] N_children: 0 N_samples: 12
> > > > [clus

In [None]:
# Saving Model
cluster_tree_dict = {'cluster': cluster, 'cluster_tree': cluster_tree, 'max_depth': max_depth}
with open('Data/NSL-KDD/modified/birch_model.h5', 'wb') as config_dictionary_file:
    pickle.dump(cluster_tree, config_dictionary_file)

In [48]:
# Loading Test Data
file = 'Data/NSL-KDD/modified/Train/5_anomaly/A1.csv'
test_X,test_y = load_mutated_data(file,'defects')

In [55]:
# Loading Test Data
file = 'Data/NSL-KDD/modified/Train/Normal_Data/N1.csv'
test_X,test_y = load_mutated_data(file,'defects')

In [49]:
# Birch classifier score(mention depth)
print(max_depth)
depth = 0
predicted = cluster.predict_new(test_X,test_y,depth,True)

2


In [34]:
# Check for outlies detected
total = 0
j = 0
for i in cluster_tree:
    if len(cluster_tree[i].test_points) == 0:
        continue
    print("Percentage Identified",len(cluster_tree[i].data_points),len(cluster_tree[i].test_points),len(cluster_tree[i].outlier_points)/len(cluster_tree[i].test_points))
    b = collections.Counter(cluster_tree[i].test_labels)
    j += 1
    total += len(cluster_tree[i].outlier_points)
print(j,total)

Percentage Identified 1815 552 0.48007246376811596
1 265


In [35]:
# Precison Recall on outliers
test_y_1 = []
predicted_1 = []
for i in cluster_tree:
    if len(cluster_tree[i].test_points) == 0:
        continue
    test_y_1.append(cluster_tree[i].test_labels)
    predicted_1.append(cluster_tree[i].outlier_points_prediction)
_test_y = [item for sublist in test_y_1 for item in sublist]
_predicted = [item for sublist in predicted_1 for item in sublist]
print(metrics.classification_report(_test_y, _predicted))

              precision    recall  f1-score   support

          -1       1.00      0.48      0.65       552
           1       0.00      0.00      0.00         0

   micro avg       0.48      0.48      0.48       552
   macro avg       0.50      0.24      0.32       552
weighted avg       1.00      0.48      0.65       552



In [66]:
# Printing the Tree with data point distribution
tree = {'0': {}, '1': {}, '2': {}, '3': {}, '4': {}}
for i in cluster_tree:
    a = collections.Counter(cluster_tree[i].data_labels)
    tree[str(cluster_tree[i].depth)][i] = a
    dpt_print = ''
    for j in range(cluster_tree[i].depth):
        dpt_print += '> '
    print(dpt_print, "Cluster_id:",i,a,cluster_tree[i].depth)
    

 Cluster_id: 0 Counter({1: 11658, -1: 6485}) 0
>  Cluster_id: 1 Counter({1: 29}) 1
> >  Cluster_id: 2 Counter({1: 4}) 2
> >  Cluster_id: 3 Counter({1: 2}) 2
> >  Cluster_id: 4 Counter({1: 1}) 2
> >  Cluster_id: 5 Counter({1: 8}) 2
> >  Cluster_id: 6 Counter({1: 14}) 2
>  Cluster_id: 7 Counter({1: 11629, -1: 6485}) 1
> >  Cluster_id: 8 Counter({1: 63}) 2
> > >  Cluster_id: 9 Counter({1: 7}) 3
> > >  Cluster_id: 10 Counter({1: 14}) 3
> > >  Cluster_id: 11 Counter({1: 17}) 3
> > >  Cluster_id: 12 Counter({1: 15}) 3
> > >  Cluster_id: 13 Counter({1: 10}) 3
> >  Cluster_id: 14 Counter({1: 261}) 2
> > >  Cluster_id: 15 Counter({1: 234}) 3
> > > >  Cluster_id: 16 Counter({1: 20}) 4
> > > >  Cluster_id: 17 Counter({1: 2}) 4
> > > >  Cluster_id: 18 Counter({1: 10}) 4
> > > >  Cluster_id: 19 Counter({1: 12}) 4
> > > >  Cluster_id: 20 Counter({1: 9}) 4
> > > >  Cluster_id: 21 Counter({1: 18}) 4
> > > >  Cluster_id: 22 Counter({1: 2}) 4
> > > >  Cluster_id: 23 Counter({1: 14}) 4
> > > >  Cluster_i

> > > >  Cluster_id: 431 Counter({1: 24}) 4
> > > >  Cluster_id: 432 Counter({1: 30}) 4
> > > >  Cluster_id: 433 Counter({1: 21}) 4
> > > >  Cluster_id: 434 Counter({1: 24}) 4
> > > >  Cluster_id: 435 Counter({1: 14}) 4
> > > >  Cluster_id: 436 Counter({1: 32}) 4
> > > >  Cluster_id: 437 Counter({1: 19}) 4
> > >  Cluster_id: 438 Counter({1: 47}) 3
> > > >  Cluster_id: 439 Counter({1: 5}) 4
> > > >  Cluster_id: 440 Counter({1: 5}) 4
> > > >  Cluster_id: 441 Counter({1: 11}) 4
> > > >  Cluster_id: 442 Counter({1: 9}) 4
> > > >  Cluster_id: 443 Counter({1: 17}) 4
> > >  Cluster_id: 444 Counter({1: 69}) 3
> > > >  Cluster_id: 445 Counter({1: 20}) 4
> > > >  Cluster_id: 446 Counter({1: 19}) 4
> > > >  Cluster_id: 447 Counter({1: 18}) 4
> > > >  Cluster_id: 448 Counter({1: 10}) 4
> > > >  Cluster_id: 449 Counter({1: 2}) 4
> > >  Cluster_id: 450 Counter({1: 109}) 3
> > > >  Cluster_id: 451 Counter({1: 7}) 4
> > > >  Cluster_id: 452 Counter({1: 6}) 4
> > > >  Cluster_id: 453 Counter({1: 15}) 4

> > > >  Cluster_id: 831 Counter({1: 16}) 4
> > >  Cluster_id: 832 Counter({1: 67}) 3
> > > >  Cluster_id: 833 Counter({1: 18}) 4
> > > >  Cluster_id: 834 Counter({1: 9}) 4
> > > >  Cluster_id: 835 Counter({1: 15}) 4
> > > >  Cluster_id: 836 Counter({1: 16}) 4
> > > >  Cluster_id: 837 Counter({1: 9}) 4
> > >  Cluster_id: 838 Counter({1: 75}) 3
> > > >  Cluster_id: 839 Counter({1: 7}) 4
> > > >  Cluster_id: 840 Counter({1: 4}) 4
> > > >  Cluster_id: 841 Counter({1: 11}) 4
> > > >  Cluster_id: 842 Counter({1: 5}) 4
> > > >  Cluster_id: 843 Counter({1: 5}) 4
> > > >  Cluster_id: 844 Counter({1: 2}) 4
> > > >  Cluster_id: 845 Counter({1: 7}) 4
> > > >  Cluster_id: 846 Counter({1: 18}) 4
> > > >  Cluster_id: 847 Counter({1: 16}) 4
> > >  Cluster_id: 848 Counter({1: 216}) 3
> > > >  Cluster_id: 849 Counter({1: 2}) 4
> > > >  Cluster_id: 850 Counter({1: 15}) 4
> > > >  Cluster_id: 851 Counter({1: 10}) 4
> > > >  Cluster_id: 852 Counter({1: 17}) 4
> > > >  Cluster_id: 853 Counter({1: 18}) 4
> 

> > > >  Cluster_id: 1331 Counter({-1: 7, 1: 1}) 4
> > > >  Cluster_id: 1332 Counter({-1: 8}) 4
> > > >  Cluster_id: 1333 Counter({-1: 9}) 4
> > > >  Cluster_id: 1334 Counter({-1: 15, 1: 2}) 4
> > > >  Cluster_id: 1335 Counter({-1: 19}) 4
> > > >  Cluster_id: 1336 Counter({-1: 10}) 4
> > > >  Cluster_id: 1337 Counter({-1: 16}) 4
> > > >  Cluster_id: 1338 Counter({-1: 10}) 4
> > > >  Cluster_id: 1339 Counter({-1: 16}) 4
> > > >  Cluster_id: 1340 Counter({-1: 21}) 4
> > > >  Cluster_id: 1341 Counter({-1: 7, 1: 6}) 4
> > > >  Cluster_id: 1342 Counter({-1: 16}) 4
> > > >  Cluster_id: 1343 Counter({-1: 14}) 4
> > > >  Cluster_id: 1344 Counter({-1: 9, 1: 1}) 4
> > > >  Cluster_id: 1345 Counter({-1: 16}) 4
> > >  Cluster_id: 1346 Counter({-1: 68, 1: 2}) 3
> > > >  Cluster_id: 1347 Counter({1: 1, -1: 1}) 4
> > > >  Cluster_id: 1348 Counter({-1: 13, 1: 1}) 4
> > > >  Cluster_id: 1349 Counter({-1: 6}) 4
> > > >  Cluster_id: 1350 Counter({-1: 16}) 4
> > > >  Cluster_id: 1351 Counter({-1: 16}) 4
>

In [None]:
for depth in tree:
    dpt_print = ''
    for j in range(int(depth)):
        dpt_print += '> '
    for cluster in tree[depth]:
        print(dpt_print, "Cluster_id: ",cluster, 'labels: ', tree[depth][cluster])
        

In [62]:
file = 'Data/NSL-KDD/modified/Train/5_anomaly/A3.csv'
test_X,test_y = load_mutated_data(file,'defects')
predicted = cluster_tree[14].classifier.predict(test_X)
print(metrics.classification_report(test_y, predicted))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       530
           1       0.00      0.00      0.00         0

   micro avg       0.00      0.00      0.00       530
   macro avg       0.00      0.00      0.00       530
weighted avg       0.00      0.00      0.00       530



In [63]:
file = 'Data/NSL-KDD/modified/Train/Normal_Data/N3.csv'
test_X,test_y = load_mutated_data(file,'defects')
predicted = cluster_tree[14].classifier.predict(test_X)
print(metrics.classification_report(test_y, predicted))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       649
           1       0.64      1.00      0.78      1166

   micro avg       0.64      0.64      0.64      1815
   macro avg       0.32      0.50      0.39      1815
weighted avg       0.41      0.64      0.50      1815



In [81]:
a = [1,2,3,5]
b = [4,8,2,3]

In [82]:
a.append(b)

In [83]:
flat_list = [item for sublist in a for item in sublist]

TypeError: 'int' object is not iterable

In [79]:
a