In [1]:
import numpy as np
from numpy import array
from hyperPipes import HyperPipes
from classDecomposition import ClassDecomposition
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

In [2]:
class HyperPipe:
    
    def __init__(self):
        self.n_dimensions = 0
        self.numerical_bounds = []

    def fit(self, data_x, target_class):
        self.target_class = target_class
        self.n_dimensions = data_x.shape[1]

        # Initializes bounds
        for i in range(self.n_dimensions):
            bounds = []
            bounds.append(float('+inf'))  # lower bound
            bounds.append(float('-inf'))  # upper bound
            self.numerical_bounds.append(bounds)

        # Add instances
        for i in range(data_x.shape[0]):
            self.__add_instance__(data_x[i])

        return None

    def __add_instance__(self, data_x):
        #check boundaries
        for i in range(self.n_dimensions):
            if(data_x[i] < self.numerical_bounds[i][0]):
                self.numerical_bounds[i][0] = data_x[i]
            if(data_x[i] > self.numerical_bounds[i][1]):
                self.numerical_bounds[i][1] = data_x[i]

        return None

    def partial_contains(self, data_x):
        count = 0
        for i in range(self.n_dimensions):
            if(data_x[i] > self.numerical_bounds[i][0] and data_x[i] < self.numerical_bounds[i][1]):
                count += 1
        score = float(count) / self.n_dimensions

        return (score, self.target_class)

In [3]:
class HyperPipes:

    def __init__(self):
        self.hyper_pipes = []

    def fit(self, data_x, data_y):
        self.y_unique_values, self.y_unique_indices = np.unique(
            data_y, return_inverse=True)
        self.n_y_unique = self.y_unique_values.shape[0]
        self.hyper_pipes = [HyperPipe() for i in range(self.n_y_unique)]

        for i in range(self.n_y_unique):
            target_class = self.y_unique_values[i]
            target_class_indices = np.where(data_y == target_class)
            data_x_filtered = data_x[target_class_indices]
            self.hyper_pipes[i].fit(data_x_filtered, target_class)

        return self

    def predict(self, data_x):
        #scores = []
        predictions = []
        for instance in data_x:
            partial_results = []
            for i in range(self.n_y_unique):
                partial_results.append(self.hyper_pipes[i].partial_contains(instance))
            best = max(partial_results,key=lambda item:item[0])[1]
            predictions.append(best)

        return predictions

In [18]:
class ClassDecomposition:
    
    def __init__(self, algorithm, k):
        self.algorithm = algorithm
        self.k = k

    def decompose(self, data_x, data_y):
        self.y_unique_values, self.y_unique_indices = np.unique(data_y, return_inverse=True)
        self.n_y_unique = self.y_unique_values.shape[0]

        renamed_data_x = np.array([])
        renamed_data_y = np.array([])
        for i in range(self.n_y_unique):
            target_class = self.y_unique_values[i]
            target_class_indices = np.where(data_y == target_class)

            data_i_x = data_x[target_class_indices]
            data_i_y = data_y[target_class_indices]
            fitted = self.algorithm.fit(data_i_x)
            labels = fitted.labels_
            
            for j in range(self.k):
                indices_j = np.where(labels == j)
                data_j_x = data_i_x[indices_j]
                data_j_y = data_i_y[indices_j]

                relabed = self.relabel(data_j_y, j)

                renamed_data_x = np.append(renamed_data_x, data_j_x)
                renamed_data_y = np.append(renamed_data_y, relabed)

        X = np.reshape(renamed_data_x, (data_x.shape[0], data_x.shape[1]))
        y = renamed_data_y
        
        return X, y

    def relabel(self, data, cluster_idx):
        return np.array(list(map(lambda item: str(item) + chr(65 + cluster_idx), data)))

In [19]:
dataset = datasets.load_breast_cancer()
X, y = dataset.data, dataset.target

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [21]:
X.shape

(569, 30)

In [22]:
y.shape

(569,)

In [23]:
hp = HyperPipes()
y_predicted = hp.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_predicted)
print('HyperPipes:\t' + str(accuracy))

HyperPipes:	0.7925531914893617


In [24]:
classifier = GaussianNB()
y_predicted = classifier.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_predicted)
print('GaussianNB:\t' + str(accuracy))

GaussianNB:	0.9414893617021277


In [25]:
k = 2
kmeans = KMeans(n_clusters=k, random_state=0)
cd = ClassDecomposition(kmeans, k)
X_decomposed, y_decomposed = cd.decompose(X,y)

In [26]:
print('before: ' + str(len(set(y))))
print('after : ' + str(len(set(y_decomposed))))

before: 2
after : 4


In [31]:
X_train, X_test, y_train, y_test = train_test_split(X_decomposed, y_decomposed, test_size=0.33, random_state=42)

In [28]:
X_decomposed.shape

(569, 30)

In [29]:
y_decomposed.shape

(569,)

In [34]:
hp = HyperPipes()
y_predicted = hp.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_predicted)
print('HyperPipes:\t' + str(accuracy))

HyperPipes:	0.8723404255319149


In [32]:
classifier = GaussianNB()
y_predicted = classifier.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_predicted)
print('GaussianNB:\t' + str(accuracy))

GaussianNB:	0.9361702127659575
