In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from graphviz import Digraph
import scipy.spatial.distance
from scipy.cluster.hierarchy import dendrogram
#Clustering birch
from freediscovery.cluster import birch_hierarchy_wrapper
from freediscovery.cluster import Birch,BirchSubcluster
#Sklearn
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn import metrics
#Learners
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
#Distance measure
from scipy.spatial.distance import euclidean

import warnings

In [2]:
warnings.filterwarnings("ignore")

In [25]:
class birch(object):

    def __init__(self,threshold=0.5,branching_factor=20,n_clusters=None):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters
        self.Birch_clusterer = Birch(threshold=self.threshold, branching_factor=self.branching_factor,
                                     n_clusters=self.n_clusters,compute_sample_indices=True)
    
    def fit(self,data,y):
        self.data = data
        self.y = y
        #self.data.drop(self.data.columns[len(self.data.columns)-1], axis=1, inplace=True)
        self.Birch_clusterer.fit(self.data)

    def get_cluster_tree(self):
        self.htree, n_clusters = birch_hierarchy_wrapper(self.Birch_clusterer)
        clusters = {}
        max_depth = 0
        for i in range(n_clusters):
            sub_cluster = self.htree.flatten()[i]
            depth = sub_cluster.current_depth
            if depth > max_depth:
                max_depth = depth
            if depth not in clusters.keys():
                clusters[depth] = {}
            if i not in clusters[depth].keys():
                clusters[depth][i] = {}
            if sub_cluster.current_depth == 0:
                clusters[depth][i]['parent'] = None
            else:
                clusters[depth][i]['parent'] = sub_cluster.parent['cluster_id']
            clusters[depth][i]['depth'] = sub_cluster.current_depth
            clusters[depth][i]['size'] = sub_cluster['cluster_size']
            clusters[depth][i]['data_points'] = sub_cluster['document_id_accumulated']
            clusters[depth][i]['centroid'] = self.data.iloc[sub_cluster['document_id_accumulated'], :].mean(axis=0).values
        return clusters,max_depth
    
    def show_clutser_tree(self):
        self.htree.display_tree()
        
    def model_adder(self,cluster_tree):
        for depth in cluster_tree:
            for cluster_id in cluster_tree[depth]:
                clf = DecisionTreeClassifier(criterion='entropy')
                sample_points = cluster_tree[depth][cluster_id]['data_points']
                train_X_sub = self.data.iloc[sample_points,:]
                train_y_sub = self.y.iloc[sample_points]
                clf.fit(train_X_sub,train_y_sub)
                cluster_tree[depth][cluster_id]['clf'] = clf
        return cluster_tree
        
    def predict(self,data,depth):
        depth = 0
        predicted = []
        for i in range(test_X.shape[0]):
            test_sample = test_X.iloc[i].tolist()
            min_distance = float('inf')
            selected_cluster = None
            for cluster_id in cluster_tree[depth]:
                u = cluster_tree[depth][cluster_id]['centroid']
                v = np.asarray(test_sample,dtype='float64')
                distance = euclidean(u,v)
                if distance < min_distance:
                    min_distance = distance
                    selected_cluster = cluster_id
            predicted.append(cluster_tree[depth][selected_cluster]['clf'].predict([test_sample]))
        return predicted

In [44]:
def load_data(path,target):
    df = pd.read_csv(path)
    if path == 'data/jm1.csv':
        df = df[~df.uniq_Op.str.contains("\?")]
    y = df[target]
    X = df.drop(labels = target, axis = 1)
    X = X.apply(pd.to_numeric)
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.33, random_state=42)
    return train_X, test_X, train_y, test_y

In [45]:
# Cluster Driver
def cluster_driver(file,print_tree = False):
    train_X, test_X, train_y, test_y = load_data(file,'defects')
    cluster = birch(branching_factor=20)
    cluster.fit(train_X,train_y)
    cluster_tree,max_depth = cluster.get_cluster_tree()
    cluster_tree = cluster.model_adder(cluster_tree)
    if print_tree:
        cluster.show_clutser_tree()
    return cluster,cluster_tree,max_depth

In [46]:
# getting the cluster tree
file = 'data/jm1.csv'
cluster,cluster_tree,max_depth = cluster_driver(file)

In [47]:
# Base performance score
clf = DecisionTreeClassifier(criterion='entropy')
train_X, test_X, train_y, test_y = load_data(file,'defects')
clf.fit(train_X, train_y)
predicted = clf.predict(test_X)
print(metrics.classification_report(test_y, predicted))

              precision    recall  f1-score   support

       False       0.84      0.85      0.85      2885
        True       0.37      0.35      0.36       706

   micro avg       0.76      0.76      0.76      3591
   macro avg       0.61      0.60      0.61      3591
weighted avg       0.75      0.76      0.75      3591



In [48]:
def load_mutated_data(path,target):
    train_X, test_X, train_y, test_y = load_data(path,target)
    test_X = pd.concat([train_X,test_X])
    test_y = pd.concat([train_y,test_y])
    return test_X,test_y

In [65]:
# Birch classifier score(mention depth)
file = 'data/jm1_19.csv'
test_X,test_y = load_mutated_data(file,'defects')
depth = max_depth
predicted = cluster.predict(test_X,depth)
print(metrics.classification_report(test_y, predicted))

              precision    recall  f1-score   support

       False       0.76      0.14      0.24      2794
        True       0.22      0.84      0.35       797

   micro avg       0.30      0.30      0.30      3591
   macro avg       0.49      0.49      0.29      3591
weighted avg       0.64      0.30      0.26      3591

