This notebook runs several clustering algorithms on the PMLB datasets and saves their performance.

In [None]:
import config
from utils import print_verbose

# Import

In [None]:
## Datasets import
from pmlb import classification_dataset_names, fetch_data

In [None]:
# Constraint methods
from constraint import random_indices, get_subselection, completion_constraint

In [None]:
# Model imports
## R models of constrained clustering
## Requires R with the conclust library installed
from rpy2.robjects.packages import importr
import rpy2.robjects.numpy2ri
rpy2.robjects.numpy2ri.activate()
clusterR = importr('conclust')

## Kmeans model
from sklearn.cluster import KMeans

## Our model


In [None]:
# Evaluation methods
from metrics import evalSplit

In [None]:
# For reproducibility
import numpy as np
np.random.seed(42)

First we define a function that loads a dataset, computes the constraints, runs the different algorithms and saves the results.  
All configuration is in `config.py`

In [None]:
def computeAndSavePerf(dname, percentageConstraint = 100, verbose = 0):
    """
        Runs the different clustering algorithms on the dataset dname
        and saves their performances in a pickle file

        Half of each class is drawn at random as "train" indices. The ground
        truth constraint matrix (+1 same label, -1 different labels, 0 on the
        diagonal) is restricted by get_subselection to the entries returned by
        random_indices(train, ...) and then passed through completion_constraint.
        Every algorithm is run on the whole data with as many clusters as
        there are distinct labels.

        Arguments:
            dname -- Name of a dataset of pmlb classification_dataset_names
            percentageConstraint -- Percentage (between 0 and 100) of the
                len(train) * (len(train) - 1) / 2 train pairs requested
                as constraints
            verbose -- Verbosity level forwarded to print_verbose

        Returns:
            dict -- Algorithm name ("ccls", "ckmeans", "lcvqe", "mpckm",
                "kmeans") -> value returned by evalSplit

        Raises:
            AssertionError -- If dname is not a pmlb classification dataset

        Side effects:
            Consumes numbers from the global numpy random state and writes
            <config.result>/<config.time>/<dname>_results.pickle
    """
    # Imported locally: no import cell of the notebook provides these modules
    import os
    import pickle

    assert dname in classification_dataset_names, "Unknown dataset"

    # Read data and put them in good format for sklearn
    data, labelvector = fetch_data(dname, return_X_y = True, local_cache_dir = config.datadir)
    data = data.astype('float64')

    # One cluster is requested per distinct ground truth label
    labels = np.unique(labelvector)
    classes = len(labels)

    # Stratified split: half of each class (rounded down) goes in train
    # The complementary indices were never used and are no longer materialised
    train = []
    for label in labels:
        index_label = np.flatnonzero(labelvector == label)
        lentrain = int(0.5 * len(index_label))
        train.extend(np.random.choice(index_label, size = lentrain, replace = False).tolist())

    print_verbose('{} : {} in {} classes'.format(dname, len(labelvector), classes), verbose)

    # Compute constraints matrix
    ## Number of constraints: a percentage of all the unordered train pairs
    number_constraint = int((percentageConstraint*(len(train)-1)*len(train)/2.)/100.)

    ## Ground truth constraints: +1 same label, -1 different labels, 0 on diagonal
    constraint = 2 * np.equal.outer(labelvector, labelvector) - 1
    np.fill_diagonal(constraint, 0)

    ## Indices computed only on train part
    indices = random_indices(train, number_constraint)
    constraint = get_subselection(constraint, indices)

    ## Completion Constraint Matrix
    print_verbose("Completion Constraint", verbose)
    constraint = completion_constraint(constraint)

    ## R Format constraints: index pairs of positive / negative entries
    ## +1 because R indices start at 1
    must_link = np.argwhere(constraint > 0) + 1
    cannot_link = np.argwhere(constraint < 0) + 1

    # Computes model
    score = {}

    ## R Models: all the conclust functions are called with the same arguments
    for name in ("ccls", "ckmeans", "lcvqe", "mpckm"):
        print_verbose(name, verbose)
        model = getattr(clusterR, name)
        assignation = np.array(model(data, classes, must_link, cannot_link))
        score[name] = evalSplit(assignation, labelvector, train)

    ## Kmeans (unconstrained)
    name = "kmeans"
    print_verbose(name, verbose)
    assignation = KMeans(n_clusters = classes).fit(data).labels_
    score[name] = evalSplit(assignation, labelvector, train)

    ## Our method

    # Save results
    info = {"Name": dname, "N_Classes": classes, "Constraint": constraint,
            "Labels": labelvector, "Train": train,  "Score": score}
    ## Folder is created on demand so that the function can be called on its own
    outdir = os.path.join(config.result, config.time)
    os.makedirs(outdir, exist_ok = True)
    with open(os.path.join(outdir, dname + "_results.pickle"), 'wb') as output:
        pickle.dump(info, output)

    return score

# Execution

In [None]:
# Creates the folder in which the results are saved
# makedirs with exist_ok: re-running the cell does not fail when the folder
# already exists, and a missing parent folder is created too
import os
os.makedirs(os.path.join(config.result, config.time), exist_ok = True)

In [None]:
# Runs computeAndSavePerf on every pmlb classification dataset
# NOTE(review): the flag is hard-wired to True, so the multiprocessing branch
# below never runs whatever config.processor is -- confirm this is intended
force_sequential = True
if force_sequential or config.processor == 1:
    # One dataset after the other, with progress messages
    for dname in classification_dataset_names:
        computeAndSavePerf(dname, verbose = 1)
else:
    # One task per dataset on config.processor workers, default verbosity
    from multiprocessing import Pool
    with Pool(config.processor) as workers:
        workers.map(computeAndSavePerf, classification_dataset_names)