This notebook allows to compute different clustering algorithms on the pmlb datasets

All path configuration and number of processor to use are in `config.py`

In [1]:
import config
from utils import print_verbose

Ran on 07 March 2019 14:20:32
Server


# Import

In [2]:
## Datasets import
from pmlb import classification_dataset_names, fetch_data

In [3]:
# Constraint methods
from constraint import random_indices, get_subselection, completion_constraint

In [4]:
# Kernels methods
from kernels.features import produce_kernels, normalize_and_check_kernels

In [5]:
# Model imports
## R Model of constrained clustering
## Require to have R and have installed conclust library
from rpy2.robjects.packages import importr
import rpy2.robjects.numpy2ri
rpy2.robjects.numpy2ri.activate()
clusterR = importr('conclust')

## Kmeans model
from sklearn.cluster import KMeans

## Our model
from models.kernel_bayes_opt import kernel_bayes_clustering


 



In [6]:
# Evaluation methods
from metrics import evalSplit

In [7]:
# For reproductibility
import numpy as np
np.random.seed(0)

In [12]:
# Save
import pickle

# Models computation

Second we define a function which opens the data, computes the constraints and then computes the different algorithms and saves the results.  

In [13]:
testMode = False

# Kernel to compute
kernels_name = ['rbf', 'sigmoid', 'polynomial', 
                'laplacian', 'linear', 'cosine']

# Force recomputation
force = False

# Max number points in dataset - for limiting memory consumption
min_points, max_points = 100, 4000

# Kernels options -- Refer to kernels/features.py
kernel_args = {"normalize": "multiplicative", 
               "check_method": "trivial", 
               "clip": True}

In [17]:
def computeAndSavePerf(dname, percentageConstraint = 100, verbose = 0, n_jobs = 1):
    """
        Computes the different algorithms and
        Saves the performances on the dataset dname
        
        percentageConstraint for the constraint matrix (between 0 and 100)
    """
    assert dname in classification_dataset_names, "Unknown dataset"
    
    # Read data and put them in good format for sklearn
    data, labelvector = fetch_data(dname, return_X_y = True, local_cache_dir = config.datadir)
    data = data.astype('float64')
    
    if len(labelvector) < min_points:
        print_verbose('Dataset too small - {}'.format(len(labelvector)), verbose)
        return {}
    if len(labelvector) > max_points:
        print_verbose('Dataset too big - {}'.format(len(labelvector)), verbose)
        return {}
    
    # Split in train and test
    ## Stratified split
    train, test = [], []
    labels, counts = np.unique(labelvector, return_counts = True)
    classes = len(labels)
    for label, count in zip(labels, counts):
        lentrain = int(0.5 * count)
        index_label = np.argwhere(labelvector == label).flatten()
        train.extend(np.random.choice(index_label, size = lentrain, replace = False).tolist())
        test.extend([i for i in index_label if i not in train])
    
    print_verbose('{} : {} points in {} classes'.format(dname, len(labelvector), len(labels)), verbose)
    
    
    # Compute constraints matrix
    ## Number constraint
    number_constraint = int((percentageConstraint*(len(train)-1)*len(train)/2.)/100.)

    ## Ground truth constraints
    constraint = 2 * np.equal.outer(labelvector, labelvector) - 1
    np.fill_diagonal(constraint, 0)

    ## Indices computed only on train part
    indices = random_indices(train, number_constraint)
    constraint = get_subselection(constraint, indices)

    ## Completion Constraint Matrix
    if percentageConstraint < 100:
        print_verbose("Completion Constraint", verbose)
        constraint = completion_constraint(constraint)
    
    ## R Format constraints
    must_link, cannot_link = np.argwhere(constraint > 0), np.argwhere(constraint < 0)
    must_link, cannot_link = must_link + 1, cannot_link + 1 # +1 for R index
    
    
    # Computes the kernels for the given data
    print_verbose("Computation Kernels", verbose)
    names, kernels = produce_kernels(dname, kernels_name, data, force = force, verbose = verbose, n_jobs = n_jobs)
    
    print_verbose("Normalization Kernels", verbose)
    names, kernels = normalize_and_check_kernels(names, kernels, classes, verbose = verbose, n_jobs = n_jobs, **kernel_args)
    
    ## If no kernel verify the defintion 
    if len(kernels) == 0:
        print_verbose("Kernels Default", verbose)
        return None
    
    # Computes model
    score = {}
    ## R Models
    name = "ckmeans"
    print_verbose(name, verbose)                                                                                                                                                           
    assignation = np.array(clusterR.ckmeans(data, classes, must_link, cannot_link))
    score[name] = evalSplit(assignation, labelvector, train)

    name = "lcvqe"
    print_verbose(name, verbose)                                                                                                                                                           
    assignation = np.array(clusterR.lcvqe(data, classes, must_link, cannot_link))
    score[name] = evalSplit(assignation, labelvector, train)

    name = "mpckm"
    print_verbose(name, verbose)                                                                                                                                                           
    assignation = np.array(clusterR.mpckm(data, classes, must_link, cannot_link))
    score[name] = evalSplit(assignation, labelvector, train)
    
    ## Kmeans
    name = "kmeans"
    print_verbose(name, verbose)                                                                                                                                                           
    assignation = KMeans(classes).fit(data).labels_
    score[name] = evalSplit(assignation, labelvector, train)
    
    ## Our method
    name = "Bayesian Optimization"
    print_verbose(name, verbose)                                                                                                                                                           
    assignation = kernel_bayes_clustering(kernels, classes, constraint, verbose = verbose)
    score[name] = evalSplit(assignation, labelvector, train)
    
    
    # Save results
    info = {"Name": dname, "N_Classes": classes, "Constraint": constraint, 
            "Labels": labelvector, "Train": train,  "Score": score}
    pickle.dump(info, open(os.path.join(config.result, config.time, dname + "_results.pickle"), 'wb'))
    
    return score

# Execution

In [18]:
# Creates a folder for save results
import os
os.mkdir(os.path.join(config.result, config.time))

FileExistsError: [Errno 17] File exists: '/zfsauton/data/public/vjeanselme/PennResults/07 March 2019 14:20:32'

In [19]:
if testMode:
    for dname in classification_dataset_names:
        computeAndSavePerf(dname, verbose = 2, n_jobs = config.processor)
else:
    from multiprocessing import Pool
    with Pool(config.processor) as pool:
        pool.starmap(computeAndSavePerf, [(dname, ) for dname in classification_dataset_names])

KeyError: 0