This notebook runs several clustering algorithms on the PMLB datasets.

All path settings and the number of processors to use are configured in `config.py`.

In [None]:
import config
from utils import print_verbose

# Import

In [None]:
# Configuration computation
from fold import createFold, readFold

In [None]:
# Datasets import
from pmlb import classification_dataset_names, fetch_data

In [None]:
# Constraint methods
from constraint import random_indices, get_subselection, completion_constraint

In [None]:
# Kernels methods
from kernels.features import produce_kernels, normalize_and_check_kernels

In [None]:
# Model imports
## R models of constrained clustering (exposed through `clusterR`)
## Requires R to be installed, together with the R library conclust
from rpy2.robjects.packages import importr
import rpy2.robjects.numpy2ri
# Enable the numpy conversion of rpy2 before calling the R functions
rpy2.robjects.numpy2ri.activate()
clusterR = importr('conclust')

## Kmeans model
from sklearn.cluster import KMeans

## Our model
from models.kernel_bayes_opt import kernel_bayes_clustering

In [None]:
# Evaluation methods
from metrics import evalSplit

In [None]:
# For reproducibility: fix the seed of numpy's global random generator
import numpy as np
np.random.seed(0)

In [None]:
# Save
import pickle
import os

In [None]:
# Creates the folder where the results are saved
# makedirs also creates missing parent folders, and exist_ok avoids a
# FileExistsError when this cell is executed again for the same config.time
os.makedirs(os.path.join(config.result, config.time), exist_ok = True)

# Configuration Computation

So that each method can be computed separately, we first save the configuration that is applied to all methods.

In [None]:
# Build the fold configuration of every classification dataset,
# visiting the datasets in sorted name order
for dname in sorted(classification_dataset_names):
    createFold(dname, verbose=2)

# Models computation

Next, we define a function that opens the data, computes the constraints, runs the different algorithms and saves the results.

In [None]:
# When True, the runs below are executed sequentially with verbose output
# instead of being dispatched to a pool of processes
testMode = False

# Kernels
## Names of the kernels to compute
kernels_name = ['rbf', 'sigmoid', 'polynomial', 'laplacian', 'linear']

## Force the recomputation of the kernels
force = False

## Kernels options -- Refer to kernels/features.py
### Used only with Bayesian Optimization method
kernel_args = dict(
    normalize = "expectation",
    check_method = "trivial",
    clip = True,
)

In [None]:
def compute(dname, algorithm, percentageConstraint = 100, verbose = 0, n_jobs = 1):
    """
        Computes the given algorithm(s) and
        Saves the performances on the dataset dname

        dname: name of the dataset, used to read its fold configuration,
            to fetch its data and to name the result file
        algorithm: name, or collection of names, of the methods to run among
            "ckmeans", "lcvqe", "mpckm", "kmeans" and "Bayesian Optimization"
        percentageConstraint for the constraint matrix (between 0 and 100)
        verbose: verbosity forwarded to print_verbose and to the helpers
        n_jobs: forwarded to the kernels computation and normalization

        Returns the dictionary which is pickled in
        config.result/config.time/<dname>_<percentageConstraint>.pickle
        with keys "Name", "Score", "Assignation" and "Percentage Constraint"
        (the last three are indexed by fold), or None when no configuration
        is found or when no kernel remains after normalization

        A fold for which one of the algorithms raises is skipped entirely
    """
    # A bare string would turn the membership tests below into substring
    # tests ("kmeans" in "ckmeans" is True) and run unrequested methods
    if isinstance(algorithm, str):
        algorithm = [algorithm]

    # Read configuration
    configuration = readFold(dname)
    if configuration is None:
        print_verbose("No configuration found for {}".format(dname), verbose)
        return None

    # Read data and put them in good format for sklearn
    data, labelvector = fetch_data(dname, return_X_y = True, local_cache_dir = config.datadir)
    data = data.astype('float64')
    classes = configuration["N_Classes"]

    # Computes the kernels for the given data
    print_verbose("Computation Kernels", verbose)
    names, kernels = produce_kernels(dname, kernels_name, data, force = force, verbose = verbose, n_jobs = n_jobs)

    print_verbose("Normalization Kernels", verbose)
    names, kernels = normalize_and_check_kernels(names, kernels, classes, verbose = verbose, n_jobs = n_jobs, **kernel_args)

    ## If no kernel => Nothing to compute
    ## (len is used as kernels might be an array without a truth value)
    if len(kernels) == 0:
        print_verbose("Kernels Default", verbose)
        return None

    # Ground truth constraint matrix
    ## +1 for same label, -1 for different labels, 0 on the diagonal
    constraintGT = 2 * np.equal.outer(labelvector, labelvector) - 1
    np.fill_diagonal(constraintGT, 0)

    # Iteration for confidence
    scoreIt, assignationIt, constraintIt = {}, {}, {}
    for fold in configuration["Train"]:
        print_verbose("Iteration {} / {}".format(fold + 1, len(configuration["Train"])), verbose)

        # Read precomputed indices for train and constraint
        train, constraint_indices = configuration["Train"][fold], configuration["Constraint"][fold]

        # Compute constraints matrix
        ## Number constraint: percentage of the number of pairs of train points
        number_constraint = int((percentageConstraint*(len(train)-1)*len(train)/2.)/100.)

        ## Subselect the constraint matrix
        constraint = get_subselection(constraintGT, constraint_indices[:number_constraint])

        ## Completion Constraint Matrix
        if percentageConstraint < 100:
            print_verbose("Completion Constraint", verbose)
            constraint = completion_constraint(constraint)

        ## Stop if no constraint
        ## Mean absolute value: computed once, also saved with the results
        constraint_density = np.mean(np.abs(constraint))
        if constraint_density == 0:
            print_verbose("No constraint", verbose)
            continue

        ## R Format constraints
        must_link, cannot_link = np.argwhere(constraint > 0), np.argwhere(constraint < 0)
        must_link, cannot_link = must_link + 1, cannot_link + 1 # +1 for R index

        # Computes model(s)
        assignation, score = {}, {}

        try:
            ## R Models: the three functions are called with the same arguments
            for method in ("ckmeans", "lcvqe", "mpckm"):
                if method in algorithm:
                    print_verbose(method, verbose)
                    assignation[method] = np.array(getattr(clusterR, method)(data, classes, must_link, cannot_link))
                    score[method] = evalSplit(assignation[method], labelvector, train)

            if "kmeans" in algorithm:
                print_verbose("kmeans", verbose)
                assignation["kmeans"] = KMeans(classes).fit(data).labels_
                score["kmeans"] = evalSplit(assignation["kmeans"], labelvector, train)

            ## Our method
            if "Bayesian Optimization" in algorithm:
                print_verbose("Bayesian Optimization", verbose)
                assignation["Bayesian Optimization"], assignation["Bayesian Optimization Kckmeans"] = kernel_bayes_clustering(kernels, classes, constraint, verbose = verbose)
                score["Bayesian Optimization"] = evalSplit(assignation["Bayesian Optimization"], labelvector, train)
                score["Bayesian Optimization Kckmeans"] = evalSplit(assignation["Bayesian Optimization Kckmeans"], labelvector, train)

        except Exception as error:
            # Best effort: the fold is skipped, but the cause is reported and
            # KeyboardInterrupt / SystemExit are no longer swallowed
            print_verbose("No clustering respecting the constraints ({}: {})".format(type(error).__name__, error), verbose)
            continue

        else:
            # Add results
            constraintIt[fold] = constraint_density
            scoreIt[fold], assignationIt[fold] = score, assignation

    # Save results
    info = {"Name": dname, "Score": scoreIt, "Assignation": assignationIt, "Percentage Constraint": constraintIt}
    result_path = os.path.join(config.result, config.time, dname + "_{}.pickle".format(percentageConstraint))
    ## Context manager so that the file is flushed and closed
    with open(result_path, 'wb') as result_file:
        pickle.dump(info, result_file)

    return info

# Computation

In [None]:
# Names of the methods to run, as tested by compute()
algorithms = [
    "ckmeans",
    "kmeans",
    "lcvqe",
    "mpckm",
    "Bayesian Optimization",
]

In [None]:
# Run method on every classification dataset
if not testMode:
    # One task per dataset, dispatched to a pool of config.processor processes
    from multiprocessing import Pool
    with Pool(config.processor) as pool:
        pool.starmap(compute, [(name, algorithms) for name in classification_dataset_names])
else:
    # Test mode: sequential and verbose, config.processor is given as n_jobs
    for dname in classification_dataset_names:
        compute(dname, algorithms, verbose=2, n_jobs=config.processor)

------

# Evolution 

How does our model behave with fewer constraints? We evaluate the performance of the different models for different numbers of constraints.

In [None]:
# Percentages of constraints to explore
percentages = [
    0.1, 0.5,
    1, 2, 3, 4, 5,
    10, 20, 30, 40, 50,
    75, 100,
]

# Dataset selection: one dataset name drawn with numpy's global generator
dname = np.random.choice(classification_dataset_names)

In [None]:
# Run method on the selected dataset for every percentage of constraints
if not testMode:
    # One task per percentage, dispatched to a pool of config.processor processes
    from multiprocessing import Pool
    with Pool(config.processor) as pool:
        pool.starmap(compute, [(dname, algorithms, pct) for pct in percentages])
else:
    # Test mode: sequential and verbose, config.processor is given as n_jobs
    for percentage in percentages:
        compute(dname, algorithms, percentageConstraint=percentage, verbose=2, n_jobs=config.processor)