How does our model behave with more constraints? We evaluate the performance of different models for different numbers of constraints.

In [None]:
import config
from utils import print_verbose

# Import

In [None]:
## Datasets import
from pmlb import classification_dataset_names, fetch_data

In [None]:
# Constraint methods
from constraint import random_indices, get_subselection, completion_constraint

In [None]:
# Kernels methods
from kernels.features import produce_kernels, normalize_and_check_kernels

In [None]:
# Model imports
## R models of constrained clustering
## Requires R and the R library conclust to be installed
from rpy2.robjects.packages import importr
import rpy2.robjects.numpy2ri
## Activate the numpy <-> R conversion before loading the R package
rpy2.robjects.numpy2ri.activate()
## Handle on the R package conclust (provides ckmeans, lcvqe and mpckm used in evaluate)
clusterR = importr('conclust')

## Kmeans model
from sklearn.cluster import KMeans

## Our model
from models.kernel_bayes_opt import kernel_bayes_clustering

In [None]:
# Evaluation methods
from metrics import evalSplit

In [None]:
# For reproducibility: fix the numpy global random seed
import numpy as np
np.random.seed(0)

In [None]:
# Save
import pickle

# Evolution computation

## Define variables

In [None]:
# When True the percentages are evaluated one after the other with verbose output,
# otherwise they are dispatched to a multiprocessing pool
testMode = False

# Passed as `force` to produce_kernels (force recomputation)
force = False

# Names of the kernels to compute
kernels_name = [
    'rbf',
    'sigmoid',
    'polynomial',
    'laplacian',
    'linear',
    'cosine',
]

# Options forwarded to normalize_and_check_kernels -- Refer to kernels/features.py
kernel_args = dict(normalize = "multiplicative",
                   check_method = "trivial",
                   clip = True)

# Percentages of constraints to explore (divided by 100 inside evaluate)
percentages = [0.01, 0.1, 0.5, 1, 2, 3, 4, 5,
               10, 20, 30, 40, 50, 75, 100]

# Dataset selection: one name drawn at random from the pmlb classification datasets
dname = np.random.choice(classification_dataset_names)

## Read data

In [None]:
# Fetch the selected dataset (config.datadir is used as local cache directory)
# and cast the features to float64 for sklearn
data, labelvector = fetch_data(dname,
                               return_X_y = True,
                               local_cache_dir = config.datadir)
data = data.astype('float64')

# Distinct labels with their number of occurrences
labels, counts = np.unique(labelvector, return_counts = True)
classes = len(labels)

summary = '{} : {} points in {} classes'.format(dname, len(labelvector), classes)
print(summary)

## Computation constraint

In [None]:
## Ground truth constraints
## Pairwise matrix: +1 when two points share the same label, -1 otherwise, 0 on the diagonal
constraintGT = 2 * np.equal.outer(labelvector, labelvector) - 1
np.fill_diagonal(constraintGT, 0)

## Computes all indices and random shuffle
## NOTE(review): `train` is not assigned at module level anywhere above this line
## (it only exists as a local variable of evaluate), so this statement raises
## NameError when the cells are run top to bottom -- confirm which set of points
## the pairs are supposed to be drawn from
indices = random_indices(train, int((len(train)-1)*len(train)/2))

## Computation kernels

In [None]:
# Computes the kernels for the given data, then normalizes / checks them
# with the options of kernel_args; both steps use config.processor jobs
n_jobs = config.processor
names, kernels = produce_kernels(dname, kernels_name, data,
                                 force = force, n_jobs = n_jobs)
names, kernels = normalize_and_check_kernels(names, kernels, classes,
                                             n_jobs = n_jobs, **kernel_args)

In [None]:
# Stop here if no kernel is left after normalize_and_check_kernels
assert len(kernels) > 0, "Kernels Default"

## Evaluation

In [None]:
def evaluate(percentageConstraint, iteration = 10, verbose = False):
    """
        Evaluate the clustering methods iteration times
        with the given percentage of constraints
        and pickle the results of every iteration

        Arguments:
            percentageConstraint {float} -- Percentage (0 - 100) of the pairs
                of training points used as constraints

        Keyword Arguments:
            iteration {int} -- Number of random stratified splits (default: {10})
            verbose {bool / int} -- Forwarded to print_verbose and
                kernel_bayes_clustering (default: {False})

        Returns:
            Dict -- Scores of the last iteration only
                (method name -> value returned by evalSplit),
                empty when iteration is 0
    """
    # Local import: the notebook only imports os in a cell placed after this one
    import os

    # Iteration for confidence
    constraintIt, trainIt = {}, {}
    scoreIt, assignationIt = {}, {}
    # Bound before the loop so the final return is valid even with iteration = 0
    score = {}
    for fold in range(iteration):
        # Split in train and test
        ## Stratified split: half of the points of each class go to train
        ## (the complementary test indices were never used, they are not built)
        train = []
        for label, count in zip(labels, counts):
            lentrain = int(0.5 * count)
            index_label = np.argwhere(labelvector == label).flatten()
            train.extend(np.random.choice(index_label, size = lentrain, replace = False).tolist())

        # Compute constraints matrix
        ## Number constraint: percentage of the len(train) * (len(train) - 1) / 2 training pairs
        number_constraint = int((percentageConstraint*(len(train)-1)*len(train)/2.)/100.)
        ## NOTE(review): `indices` is read from module scope and is not recomputed
        ## from this fold's `train` -- confirm the selected pairs belong to this split
        constraint = get_subselection(constraintGT, indices[:number_constraint])

        ## Completion Constraint Matrix
        if percentageConstraint < 100:
            print_verbose("Completion Constraint", verbose)
            constraint = completion_constraint(constraint)

        ## R Format constraints
        must_link, cannot_link = np.argwhere(constraint > 0), np.argwhere(constraint < 0)
        must_link, cannot_link = must_link + 1, cannot_link + 1 # +1 for R index

        # Computes model
        score, assignation = {}, {}
        ## R Models: same call signature, looked up by name in the conclust package
        for name in ("ckmeans", "lcvqe", "mpckm"):
            print_verbose(name, verbose)
            r_model = getattr(clusterR, name)
            assignation[name] = np.array(r_model(data.copy(), classes, must_link, cannot_link))
            score[name] = evalSplit(assignation[name], labelvector, train)

        ## Kmeans (unconstrained baseline)
        name = "kmeans"
        print_verbose(name, verbose)
        assignation[name] = KMeans(classes).fit(data).labels_
        score[name] = evalSplit(assignation[name], labelvector, train)

        ## Our method: returns two assignations, stored under two names
        name = "Bayesian Optimization"
        print_verbose(name, verbose)
        assignation[name], assignation[name + " Kckmeans"] = kernel_bayes_clustering(kernels, classes, constraint, verbose = verbose)
        score[name] = evalSplit(assignation[name], labelvector, train)
        score[name + " Kckmeans"] = evalSplit(assignation[name + " Kckmeans"], labelvector, train)

        # Add results
        constraintIt[fold], trainIt[fold] = constraint, train
        scoreIt[fold], assignationIt[fold] = score, assignation

    # Save results
    info = {"Name": dname, "N_Classes": classes, "Labels": labelvector,
            "Constraint": constraintIt, "Train": trainIt,  "Score": scoreIt, "Assignation": assignationIt}
    path = os.path.join(config.result, config.time, dname + "_{}_evolution.pickle".format(percentageConstraint))
    # Context manager so the file is flushed and closed even if pickling fails
    with open(path, 'wb') as handle:
        pickle.dump(info, handle)

    return score

In [None]:
# Creates the folder in which the results are saved
import os
# makedirs also creates config.result when it is missing, and exist_ok
# allows this cell to be run again without raising FileExistsError
os.makedirs(os.path.join(config.result, config.time), exist_ok = True)

In [None]:
# Run evaluate for every percentage: in parallel by default,
# sequentially with verbose output in test mode
if not testMode:
    from multiprocessing import Pool
    with Pool(config.processor) as workers:
        # One task per percentage, each called as evaluate(percentage)
        workers.map(evaluate, percentages)
else:
    for percentage in percentages:
        evaluate(percentage, verbose = 2)