In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from graphviz import Digraph
import scipy.spatial.distance
from scipy.cluster.hierarchy import dendrogram
#Clustering birch
from freediscovery.cluster import birch_hierarchy_wrapper
from freediscovery.cluster import Birch,BirchSubcluster
#Sklearn
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn import metrics
#Learners
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
#Distance measure
from scipy.spatial.distance import euclidean

import warnings

In [22]:
# Silence every warning for the rest of the session: the "ignore" action is installed
# with no category or module restriction, so nothing is filtered selectively.
warnings.filterwarnings("ignore")

In [163]:
class bcluster(object):
    """One node of the cluster tree built by ``birch.get_cluster_tree``.

    A node is a plain record: it stores its position in the tree (parent,
    depth, id), the training rows assigned to it, the centroid of those rows,
    an optional per-cluster classifier, and the bookkeeping collected while
    test rows are routed to it (test points, predictions, score).
    """

    def __init__(self):
        # Position in the tree.
        self.parent = None
        self.parent_id = None
        self.depth = None
        self.size = None
        self.cluster_id = None
        # Training rows assigned to this cluster and their centroid.
        self.data_points = []
        self.centroid = None
        # Evaluation bookkeeping; filled by add_test_points / add_predicted /
        # set_test_labels / set_score.
        self.test_points = []
        self.test_labels = []
        self.predicted = []
        self.score = []
        # Attached models / source objects.
        self.classifier = None
        self.outlier_model = None
        self.cluster_obj = None
        self.outlier_points = []

    def set_parent(self,parent_node=None):
        """Attach ``parent_node`` as this node's parent, or clear the parent.

        Called without an argument (or with ``None``) the node becomes a
        root: both ``parent`` and ``parent_id`` are reset to ``None``.
        """
        # Identity check: `== None` would dispatch to the parent's __eq__ and
        # could wrongly treat a real node as "no parent".
        if parent_node is None:
            self.parent = None
            self.parent_id = None
        else:
            self.parent = parent_node
            self.parent_id = parent_node.cluster_id

    def set_depth(self,depth):
        """Store the depth of this node in the tree."""
        self.depth = depth

    def set_size(self,size):
        """Store the size reported for this cluster."""
        self.size = size

    def set_cluster_id(self,cluster_id):
        """Store the identifier of this cluster."""
        self.cluster_id = cluster_id

    def set_data_points(self,data_points):
        """Replace the collection of training rows assigned to this cluster."""
        self.data_points = data_points

    def set_test_labels(self,test_labels):
        """Replace the true labels of the test rows routed to this cluster."""
        self.test_labels = test_labels

    def add_test_points(self,test_point):
        """Append one test row identifier to ``test_points``."""
        self.test_points.append(test_point)

    def add_predicted(self,predicted):
        """Append one prediction to ``predicted``."""
        self.predicted.append(predicted)

    def set_centroid(self,centroid):
        """Store the centroid of this cluster."""
        self.centroid = centroid

    def set_classifier(self,classifier):
        """Attach the classifier associated with this cluster."""
        self.classifier = classifier

    def set_outlier_model(self,outlier_model):
        """Attach the outlier model associated with this cluster."""
        self.outlier_model = outlier_model

    def set_cluster_obj(self,cluster_obj):
        """Store the underlying cluster object for this node."""
        self.cluster_obj = cluster_obj

    def set_outlier_points(self,outlier_points):
        """Replace the collection of outlier points of this cluster."""
        self.outlier_points = outlier_points

    def set_score(self,score):
        """Store the evaluation score computed for this cluster."""
        self.score = score

In [277]:
class birch(object):
    """Birch clustering with one decision tree trained per cluster.

    Typical flow: ``fit`` -> ``get_cluster_tree`` -> ``model_adder`` ->
    ``predict`` -> ``certify_model``. The cluster tree is a dict of
    ``bcluster`` nodes; the most recently built tree is kept on
    ``self.cluster_tree`` so ``predict`` does not depend on a module-level
    variable.
    """

    def __init__(self,threshold=0.7,branching_factor=20,n_clusters=None):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters
        # Tree of bcluster nodes; set by get_cluster_tree() / model_adder().
        self.cluster_tree = None
        self.Birch_clusterer = Birch(threshold=self.threshold, branching_factor=self.branching_factor,
                                     n_clusters=self.n_clusters,compute_sample_indices=True)

    def fit(self,data,y):
        """Remember the training frame and labels and fit Birch on ``data``.

        ``data`` and ``y`` are later indexed positionally with ``.iloc``, so
        they are expected to be a pandas DataFrame / Series of equal length.
        """
        self.data = data
        self.y = y
        self.Birch_clusterer.fit(self.data)

    def get_cluster_tree(self):
        """Convert the fitted Birch hierarchy into a dict of ``bcluster`` nodes.

        Returns:
            (clusters, max_depth): ``clusters`` maps the position of each
            sub-cluster in the flattened hierarchy to its node, and
            ``max_depth`` is the largest depth encountered.
        """
        self.htree, n_clusters = birch_hierarchy_wrapper(self.Birch_clusterer)
        # Flatten once; the hierarchy is not modified inside the loop.
        sub_clusters = self.htree.flatten()
        clusters = {}
        max_depth = 0
        for i in range(n_clusters):
            sub_cluster = sub_clusters[i]
            depth = sub_cluster.current_depth
            max_depth = max(max_depth, depth)

            node = bcluster()
            node.set_cluster_id(sub_cluster['cluster_id'])
            if depth == 0:
                node.set_parent()
            else:
                # NOTE(review): the parent is looked up by its 'cluster_id' while
                # nodes are stored by flatten position; this assumes the two
                # coincide and that parents precede children - confirm.
                node.set_parent(clusters[sub_cluster.parent['cluster_id']])
            node.set_depth(depth)
            node.set_size(sub_cluster['cluster_size'])

            members = sub_cluster['document_id_accumulated']
            node.set_data_points(members)
            # Centroid = column-wise mean of the training rows in this cluster.
            node.set_centroid(self.data.iloc[members, :].mean(axis=0).values)
            clusters[i] = node
        self.cluster_tree = clusters
        return clusters,max_depth

    def show_clutser_tree(self):
        """Display the Birch hierarchy (requires ``get_cluster_tree`` first)."""
        self.htree.display_tree()

    # Correctly spelled alias; the original name is kept for existing callers.
    show_cluster_tree = show_clutser_tree

    def model_adder(self,cluster_tree):
        """Train an entropy decision tree on each cluster's training rows.

        The classifier is attached to every node of ``cluster_tree`` in place;
        the same dict is returned and remembered on ``self.cluster_tree``.
        """
        for node in cluster_tree.values():
            clf = DecisionTreeClassifier(criterion='entropy')
            sample_points = node.data_points
            train_X_sub = self.data.iloc[sample_points,:]
            train_y_sub = self.y.iloc[sample_points]
            clf.fit(train_X_sub,train_y_sub)
            node.set_classifier(clf)
        self.cluster_tree = cluster_tree
        return cluster_tree

    def predict(self,test_X,depth,cluster_tree=None):
        """Route each row to the nearest cluster at ``depth`` and classify it.

        Args:
            test_X: DataFrame of test rows; its index labels are recorded on
                the selected nodes.
            depth: only clusters whose ``depth`` equals this value are
                candidates.
            cluster_tree: dict of nodes to use. Defaults to the tree stored by
                ``get_cluster_tree`` / ``model_adder``.

        Returns:
            A list with one entry per row, each being the value returned by the
            selected cluster's ``classifier.predict`` for that single row.

        Raises:
            ValueError: if no tree is available, no cluster exists at
                ``depth``, or no finite distance could be computed for a row.

        Side effects: the row label and the prediction are appended to the
        selected node, so repeated calls accumulate on the nodes.
        """
        if cluster_tree is None:
            cluster_tree = getattr(self, 'cluster_tree', None)
        if cluster_tree is None:
            raise ValueError("no cluster tree available: call get_cluster_tree() "
                             "and model_adder() first, or pass cluster_tree")

        # The candidate set depends only on depth, so compute it once.
        candidates = [node for node in cluster_tree.values() if node.depth == depth]
        if not candidates:
            raise ValueError("no clusters found at depth %r" % (depth,))

        predicted = []
        for row_label, row in test_X.iterrows():
            test_sample = row.values
            v = np.asarray(test_sample,dtype='float64')
            min_distance = float('inf')
            selected_node = None
            for node in candidates:
                distance = euclidean(node.centroid,v)
                # Strict '<' keeps the first of several equally near clusters.
                if distance < min_distance:
                    min_distance = distance
                    selected_node = node
            if selected_node is None:
                raise ValueError("could not compute a finite distance for row %r" % (row_label,))
            selected_node.add_test_points(row_label)
            _predicted_label = selected_node.classifier.predict([test_sample])
            selected_node.add_predicted(_predicted_label)
            predicted.append(_predicted_label)
        return predicted

    def certify_model(self,cluster_tree,test_y):
        """Store a classification report on every node that received test rows.

        ``test_y`` is indexed by the row labels recorded during ``predict``;
        nodes without test rows are left untouched.
        """
        for node in cluster_tree.values():
            if len(node.test_points) == 0:
                continue
            # test_points holds index labels, so select by label explicitly.
            node.set_test_labels(test_y.loc[node.test_points].values)
            score = metrics.classification_report(node.test_labels, node.predicted)
            node.set_score(score)

In [278]:
def load_data(path,target,test_size=0.33,random_state=42):
    """Read a CSV file and split it into train/test features and labels.

    Args:
        path: location of the CSV file.
        target: name of the label column; it is removed from the features.
        test_size: fraction of rows placed in the test split.
        random_state: seed passed to ``train_test_split`` for a repeatable split.

    Returns:
        ``(train_X, test_X, train_y, test_y)`` as produced by
        ``train_test_split``; every feature column is converted to numeric.
    """
    df = pd.read_csv(path)
    if path == 'data/jm1.csv':
        # Drop rows whose uniq_Op value contains a literal '?'
        # (presumably a missing-value marker - confirm against the data).
        # Raw string: "\?" in a normal string is an invalid escape sequence.
        df = df[~df.uniq_Op.str.contains(r"\?")]
    y = df[target]
    X = df.drop(labels = target, axis = 1)
    X = X.apply(pd.to_numeric)
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return train_X, test_X, train_y, test_y

In [279]:
# Cluster driver
def cluster_driver(file,print_tree = False):
    """Fit Birch on the training split of `file` and attach per-cluster models.

    The label column is 'defects'. Only the training part of the split is
    used here. When `print_tree` is true the hierarchy is displayed after the
    classifiers have been attached.

    Returns:
        (model, tree, deepest): the fitted `birch` object, its cluster tree
        with classifiers attached, and the maximum depth of that tree.
    """
    splits = load_data(file,'defects')
    features, labels = splits[0], splits[2]

    model = birch(branching_factor=20)
    model.fit(features, labels)

    tree, deepest = model.get_cluster_tree()
    tree = model.model_adder(tree)

    if print_tree:
        model.show_clutser_tree()
    return model, tree, deepest

In [280]:
# Build the cluster tree for the JDT dataset: cluster_driver fits Birch on the
# training split and attaches one decision tree per cluster.
# `file`, `cluster`, `cluster_tree` and `max_depth` are reused by later cells.
file = 'data/JDT.csv'
cluster,cluster_tree,max_depth = cluster_driver(file)

In [281]:
# Baseline: a single entropy decision tree trained on the whole training split
# and scored on the held-out split of the same file.
train_X, test_X, train_y, test_y = load_data(file,'defects')
clf = DecisionTreeClassifier(criterion='entropy')
predicted = clf.fit(train_X, train_y).predict(test_X)
print(metrics.classification_report(test_y, predicted))

              precision    recall  f1-score   support

       False       0.86      0.88      0.87       258
        True       0.53      0.49      0.51        72

   micro avg       0.79      0.79      0.79       330
   macro avg       0.70      0.68      0.69       330
weighted avg       0.79      0.79      0.79       330



In [282]:
def load_mutated_data(path,target):
    """Load `path` and return all of its rows as one (features, labels) pair.

    The file is split by `load_data` and the two parts are concatenated
    again, training part first, so every row is returned in the order
    produced by that split.
    """
    X_first, X_second, y_first, y_second = load_data(path,target)
    all_X = pd.concat([X_first, X_second])
    all_y = pd.concat([y_first, y_second])
    return all_X, all_y

In [283]:
# Birch classifier score at a chosen tree depth (here the deepest level, max_depth).
# All rows of JDT_1.csv are used for evaluation (load_mutated_data returns both splits).
file = 'data/JDT_1.csv'
test_X,test_y = load_mutated_data(file,'defects')
depth = max_depth
# predict() also appends each routed row and its prediction to the selected cluster
# node, so re-running this cell accumulates entries on the nodes.
predicted = cluster.predict(test_X,depth)
print(metrics.classification_report(test_y, predicted))
# Store a per-cluster classification report on every node that received test rows.
cluster.certify_model(cluster_tree,test_y)

              precision    recall  f1-score   support

       False       0.80      0.59      0.68       254
        True       0.27      0.50      0.35        76

   micro avg       0.57      0.57      0.57       330
   macro avg       0.53      0.54      0.51       330
weighted avg       0.67      0.57      0.60       330



In [284]:
# Print the stored report of every cluster that received at least one test row.
for i, node in cluster_tree.items():
    if len(node.test_points) > 0:
        print(node.score)

              precision    recall  f1-score   support

       False       0.00      0.00      0.00        34
        True       0.13      1.00      0.23         5

   micro avg       0.13      0.13      0.13        39
   macro avg       0.06      0.50      0.11        39
weighted avg       0.02      0.13      0.03        39

              precision    recall  f1-score   support

       False       1.00      1.00      1.00         3

   micro avg       1.00      1.00      1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

              precision    recall  f1-score   support

       False       0.80      0.67      0.73         6
        True       0.71      0.83      0.77         6

   micro avg       0.75      0.75      0.75        12
   macro avg       0.76      0.75      0.75        12
weighted avg       0.76      0.75      0.75        12

              precision    recall  f1-score   support

       False      

In [51]:
# NOTE(review): execution count 51 is older than the cells above, so this looks like a
# leftover cell. In top-to-bottom order `cluster` is the birch instance returned by
# cluster_driver, and birch defines no __call__, so this call would raise TypeError.
# Confirm the intent or remove the cell.
c = cluster()