In [3]:

 def generate_noisy_labels(y, noise_matrix, verbose=False):
    """Generates noisy labels s from perfect labels y, approximately
    yielding the provided noise_matrix between s and y.

    For each true class k, the number of examples to relabel as every other
    class is taken from ``noise_matrix * p(y) * N``, truncated to an integer.
    That many examples of class k are picked at random (without replacement)
    and relabeled. A class with fewer examples than requested flips is left
    unchanged.

    Parameters
    ----------
    y : array-like of N labels
        Perfect labels, without any noise. Contains K distinct natural number
        classes, e.g. 0, 1,..., K-1
    noise_matrix : np.array of shape (K, K), K = number of classes
        A conditional probability matrix of the form P(s=k_s|y=k_y) containing
        the fraction of examples in every class, labeled as every other class.
        Assumes columns of noise_matrix sum to 1.
    verbose : bool (default: False)
        If True, prints a step-by-step trace of the flips chosen for every
        class. If False, nothing is printed.

    Returns
    -------
    s : np.array
        A copy of y in which the selected examples carry their noisy label.

    Examples
    --------
    The for loop below is a slow but readable sketch of what this function
    does. It is not the implementation used, only Python pseudocode.

    .. code:: python
        # Generate s
        count_joint = (noise_matrix * py * len(y)).astype(int)
        s = np.array(y)
        for k_s in range(K):
            for k_y in range(K):
                if k_s != k_y:
                    idx_flip = np.where((s==k_y)&(y==k_y))[0]
                    if len(idx_flip): # pragma: no cover
                        s[np.random.choice(
                            idx_flip,
                            count_joint[k_s][k_y],
                            replace=False,
                        )] = k_s
    """

    # Make y a numpy array, if it is not
    y = np.asarray(y)

    # Number of classes
    K = len(noise_matrix)

    # Compute p(y=k).
    # NOTE(review): assumes value_counts returns one count per class, ordered
    # by class, so that it broadcasts against the columns of noise_matrix.
    py = value_counts(y) / float(len(y))

    # Counts of pairs (s, y). astype(int) truncates, it does not round.
    count_joint = (noise_matrix * py * len(y)).astype(int)
    # Remove diagonal entries as they do not involve flipping of labels.
    np.fill_diagonal(count_joint, 0)

    if verbose:
        print("------------------------------------------")
    # Generate s
    s = np.array(y)
    for k in range(K):  # Iterate over true class y == k
        # Get the noisy s labels that have non-zero counts
        s_labels = np.where(count_joint[:, k] != 0)[0]
        # Find out how many of each noisy s label we need to flip to
        s_counts = count_joint[s_labels, k]
        # Create a list of the new noisy labels, each repeated by its count
        noise = [s_labels[i] for i, c in enumerate(s_counts) for _ in range(c)]
        # Examples of class k that are still unflipped and may be relabeled
        idx_flip = np.where((s == k) & (y == k))[0]
        if verbose:
            print("k = ", k)
            print("noisy labels that have non-zero counts = ", s_labels)
            print("how many of each noisy label we need to flip = ", s_counts)
            print("list of new noisy labels = ", noise)
            print("Randomly choose y labels for class k and set them to the noisy labels = ",idx_flip)
        # Only flip when class k has at least as many examples as flips needed
        if len(idx_flip) and len(noise) and len(idx_flip) >= len(
               noise):  # pragma: no cover
            s[np.random.choice(idx_flip, len(noise), replace=False)] = noise
        if verbose:
            print("noisy labels = ",s)
            print("------------------------------------------")

    # The noise matrix induced by s is not validated; this check is disabled:
    # counts = confusion_matrix(s, y).astype(float)
    # new_noise_matrix = counts / counts.sum(axis=0)
    # assert(np.linalg.norm(noise_matrix - new_noise_matrix) <= 2)

    return s

In [4]:
import numpy as np

In [5]:
noise_matrix = np.array([[.7,.2,.04,.03,.03],
                         [.2,.7,.04,.03,.03],
                         [.05,.01,.68,.13,.13],
                         [.01,.02,.12,.7,.15],
                         [.04,.04,.17,.05,.7]])

# Check that every row sums to 1, with a floating-point tolerance rather than
# exact equality (sums of decimal fractions are not always exactly 1.0).
# NOTE(review): generate_noisy_labels documents that the *columns* of
# noise_matrix sum to 1, but this matrix is row-stochastic: its column sums
# are 1.00, 0.97, 1.05, 0.94 and 1.04. Confirm whether a transpose is intended.
np.isclose(np.sum(noise_matrix, axis=1), 1)

array([ True,  True,  True,  True,  True])

In [6]:
# True class label (values 0-4) for each of the ten toy examples.
y = np.array(
    [2, 0, 3, 1, 4, 1, 4, 1, 1, 4]
)

In [3]:
from __future__ import print_function, absolute_import, division, \
    unicode_literals, with_statement
import numpy as np
from cleanlab.util import value_counts, confusion_matrix
import warnings

In [10]:
# generate_noisy_labels picks the examples to flip with np.random.choice, so
# seed NumPy's global generator to make the result reproducible on re-run.
np.random.seed(0)
# Ask explicitly for the step-by-step trace of the flipping procedure.
generate_noisy_labels(y, noise_matrix, verbose=True)

------------------------------------------
k =  0
noisy labels that have non-zero counts =  []
how many of each noisy label we need to flip =  []
list of new noisy labels =  []
Randomly choose y labels for class k and set them to the noisy labels =  [1]
noisy labels =  [2 0 3 1 4 1 4 1 1 4]
------------------------------------------
k =  1
noisy labels that have non-zero counts =  []
how many of each noisy label we need to flip =  []
list of new noisy labels =  []
Randomly choose y labels for class k and set them to the noisy labels =  [3 5 7 8]
noisy labels =  [2 0 3 1 4 1 4 1 1 4]
------------------------------------------
k =  2
noisy labels that have non-zero counts =  []
how many of each noisy label we need to flip =  []
list of new noisy labels =  []
Randomly choose y labels for class k and set them to the noisy labels =  [0]
noisy labels =  [2 0 3 1 4 1 4 1 1 4]
------------------------------------------
k =  3
noisy labels that have non-zero counts =  []
how many of each noisy 

array([2, 0, 3, 1, 4, 1, 4, 1, 1, 4])

In [5]:
# Given (possibly noisy) labels for the ten toy examples.
# NOTE(review): this literal is identical to the `y` array defined in an
# earlier cell, i.e. no label differs from the true labels here - confirm
# that this is intended.
s = np.array([2, 0, 3, 1, 4, 1, 4, 1, 1, 4])
# Hard-coded probability matrix with one row per example and one column per
# class (ten rows of five values). Presumably these are the P(s=k|x) values
# described in the get_noise_indices docstring - TODO confirm how they were
# produced, the notebook does not show it.
psx = np.array([[6.54160162e-04, 3.03065872e-06, 9.27881896e-01, 3.08100454e-04,
         7.11528882e-02],
        [9.52379167e-01, 2.47183759e-02, 1.75267160e-02, 5.93801611e-04,
         4.78204014e-03],
        [2.07699835e-09, 4.27654939e-10, 5.88927560e-06, 9.99994159e-01,
         4.86104665e-17],
        [8.21316838e-01, 1.68881238e-01, 7.79483374e-03, 2.00584810e-03,
         1.22517827e-06],
        [2.54394426e-08, 3.22418398e-10, 1.06855035e-01, 4.39613643e-08,
         8.93144906e-01],
        [4.56457287e-01, 5.43138981e-01, 2.59662047e-04, 1.44051432e-04,
         6.48661169e-09],
        [2.46501830e-03, 1.09589223e-06, 2.93296985e-02, 1.29379432e-05,
         9.68191206e-01],
        [1.82137087e-01, 8.17485034e-01, 3.06710252e-04, 7.09961532e-05,
         2.27663620e-07],
        [5.11904418e-01, 4.87983733e-01, 8.93332399e-05, 2.25182412e-05,
         1.91881022e-09],
        [4.90616987e-13, 2.53091194e-11, 2.81081963e-02, 7.72716169e-10,
         9.71891820e-01]])

In [9]:
# Display the labels together with the shape of the probability matrix.
(s, psx.shape)

(array([2, 0, 3, 1, 4, 1, 4, 1, 1, 4]), (10, 5))

In [1]:
def get_noise_indices(
        s,
        psx,
        frac_noise=1.0,
        prune_method='prune_by_noise_rate',
        verbose=0,
        confident_joint=None,
        num_to_remove_per_class=None,
        multi_label=False,
        n_jobs=1,
):
    """Returns a boolean mask flagging the most likely (confident) label
    errors in s.

    * If you encounter the error 'psx is not defined', try setting n_jobs = 1.

    NOTE(review): this function relies on names that are neither defined nor
    imported in the visible notebook cells (keep_at_least_n_per_class,
    MIN_NUM_PER_CLASS, round_preserving_row_totals, RawArray, int2onehot,
    multiprocessing_context, _init, _prune_by_class, _prune_by_count, tqdm,
    tqdm_exists, sys). Presumably they come from cleanlab.pruning; they must
    be in scope before this function can run.

    Parameters
    ----------
    s : np.array
      A vector of labels, s, which may contain mislabeling. "s" denotes
      the noisy label instead of \\tilde(y), for ASCII encoding reasons.
    psx : np.array (shape (N, K))
      P(s=k|x) is a matrix with K (noisy) probabilities for each of the N
      examples x.
      This is the probability distribution over all K classes, for each
      example, regarding whether the example has label s==k P(s=k|x).
      psx should have been computed using 3+ fold cross-validation.
    frac_noise : float (default: 1.0)
      Forwarded to keep_at_least_n_per_class. Described as the fraction of
      the estimated noise to flag; with frac_noise = 1.0 all "confident"
      estimated label errors are flagged.
    prune_method : str (default: 'prune_by_noise_rate')
      Possible Values: 'prune_by_class', 'prune_by_noise_rate', or 'both'.
      Method used for pruning.
      1. 'prune_by_noise_rate': works by removing examples with
      *high probability* of being mislabeled for every non-diagonal
      in the prune_counts_matrix (see pruning.py).
      2. 'prune_by_class': works by removing the examples with *smallest
      probability* of belonging to their given class label for every class.
      3. 'both': Finds the examples satisfying (1) AND (2) and
      removes their set conjunction.
    verbose : int (default: 0)
      If truthy, prints a progress message when running in parallel.
    confident_joint : np.array or None (default: None)
      If None, it is computed with compute_confident_joint(s, psx).
      Presumably of shape (K, K) - TODO confirm.
    num_to_remove_per_class : array-like or None (default: None)
      If given, the prune counts are recalibrated so that the off-diagonal
      counts follow these per-class totals.
    multi_label : bool (default: False)
      Forwarded to compute_confident_joint and to the pruning helpers.
    n_jobs : int (default: 1)
      Number of worker processes. 1 runs everything in the current process.

    Returns
    -------
    label_errors_mask : np.array of bool
      True where an example is flagged as a label error by the selected
      prune_method.

    Raises
    ------
    ValueError
      If prune_method is not one of the supported values or n_jobs < 1.
    """

    if prune_method not in ('prune_by_class', 'prune_by_noise_rate', 'both'):
        raise ValueError(
            "prune_method must be 'prune_by_class', 'prune_by_noise_rate' "
            "or 'both', got {!r}".format(prune_method)
        )
    if n_jobs < 1:
        raise ValueError('n_jobs must be >= 1, got {!r}'.format(n_jobs))

    # Number of classes s
    K = len(psx.T)
    # Boolean set to true if dataset is large
    big_dataset = K * len(s) > 1e8

    # Number of examples carrying each label.
    # NOTE(review): assumes labels are integers in 0..K-1, and that in the
    # multi_label case s is an iterable of per-example label lists - confirm.
    if multi_label:
        flat_labels = [label for labels in s for label in labels]
    else:
        flat_labels = np.asarray(s)
    s_counts = np.bincount(flat_labels, minlength=K)

    # Ensure labels are of type np.array()
    s = np.asarray(s)

    if confident_joint is None:
        from cleanlab.latent_estimation import compute_confident_joint
        confident_joint = compute_confident_joint(
            s=s,
            psx=psx,
            multi_label=multi_label,
        )

    # Leave at least MIN_NUM_PER_CLASS examples per class.
    # NOTE prune_count_matrix is transposed (relative to confident_joint)
    prune_count_matrix = keep_at_least_n_per_class(
        prune_count_matrix=confident_joint.T,
        n=MIN_NUM_PER_CLASS,
        frac_noise=frac_noise,
    )

    if num_to_remove_per_class is not None:
        # Estimate joint probability distribution over label errors
        psy = prune_count_matrix / np.sum(prune_count_matrix, axis=1)
        noise_per_s = psy.sum(axis=1) - psy.diagonal()
        # Calibrate s.t. noise rates sum to num_to_remove_per_class
        tmp = (psy.T * num_to_remove_per_class / noise_per_s).T
        np.fill_diagonal(tmp, s_counts - num_to_remove_per_class)
        prune_count_matrix = round_preserving_row_totals(tmp)

    if n_jobs > 1:  # Prepare multiprocessing shared data
        if multi_label:
            _s = RawArray('I', int2onehot(s).flatten())
        else:
            _s = RawArray('I', s)
        _s_counts = RawArray('I', s_counts)
        _prune_count_matrix = RawArray(
            'I', prune_count_matrix.flatten())
        _psx = RawArray(
            'f', psx.flatten())
        pool_initargs = (_s, _s_counts, _prune_count_matrix,
                         prune_count_matrix.shape, _psx, psx.shape,
                         multi_label)
    else:  # Multiprocessing is turned off. Create tuple with all parameters
        args = (s, s_counts, prune_count_matrix, psx, multi_label)

    def _mask_over_classes(prune_fn, message):
        # Run prune_fn once per class and OR the per-class masks together.
        if n_jobs > 1:  # parallelize
            with multiprocessing_context(
                    n_jobs,
                    initializer=_init,
                    initargs=pool_initargs,
            ) as p:
                if verbose:
                    print(message)
                sys.stdout.flush()
                if big_dataset and tqdm_exists:
                    masks = list(
                        tqdm.tqdm(p.imap(prune_fn, range(K)), total=K),
                    )
                else:
                    masks = p.map(prune_fn, range(K))
        else:  # n_jobs = 1, so no parallelization
            masks = [prune_fn(k, args) for k in range(K)]
        return np.stack(masks).any(axis=0)

    # Perform Pruning with threshold probabilities from BFPRT algorithm in O(n)
    # Operations are parallelized across all CPU processes
    if prune_method == 'prune_by_class' or prune_method == 'both':
        label_errors_mask_by_class = _mask_over_classes(
            _prune_by_class,
            'Parallel processing label errors by class.',
        )

    if prune_method == 'prune_by_noise_rate' or prune_method == 'both':
        label_errors_mask_by_rate = _mask_over_classes(
            _prune_by_count,
            'Parallel processing label errors by noise rate.',
        )

    if prune_method == 'both':
        # Keep only the examples flagged by both methods
        label_errors_mask = (
            label_errors_mask_by_rate & label_errors_mask_by_class
        )
    elif prune_method == 'prune_by_class':
        label_errors_mask = label_errors_mask_by_class
    else:
        label_errors_mask = label_errors_mask_by_rate

    return label_errors_mask

In [9]:
import cleanlab
from cleanlab.noise_generation import generate_noisy_labels
from cleanlab.util import print_noise_matrix
from cleanlab import baseline_methods
from cleanlab.latent_estimation import compute_confident_joint
from cleanlab import baseline_methods

In [13]:
def baseline_argmax(psx, s):
    '''Simplest baseline: flag an example as a label error whenever its
    most probable class differs from its given label.

    Parameters
    ----------
    psx : np.array (shape (N, K))
        P(s=k|x), a matrix with K (noisy) probabilities for each of the
        N examples x, i.e. one probability distribution over the K classes
        per example. psx should have been computed using 3 (or higher) fold
        cross-validation.
    s : np.array
        A discrete vector of noisy labels, i.e. some labels may be erroneous.

    Returns
    -------
        A boolean mask that is True at every index whose example is
        considered a label error.'''

    predicted_labels = np.argmax(psx, axis=1)
    given_labels = np.asarray(s)
    return predicted_labels != given_labels

In [12]:
# Method: confident joint only.
# Mark the examples that compute_confident_joint reports as off-diagonal.
# NOTE(review): the returned indices are presumably integer positions into s
# - confirm against compute_confident_joint.
label_error_indices = compute_confident_joint(
    s, psx, return_indices_of_off_diagonals=True)[1]
label_error_mask = np.zeros(len(s), dtype=bool)
# Without this assignment the mask stays all-False and the indices are unused.
label_error_mask[np.asarray(label_error_indices, dtype=int)] = True
baseline_conf_joint_only = label_error_mask

# Method: C_confusion
# NOTE(review): this rebinds the name baseline_argmax, which another cell of
# this notebook defines as a function, to a boolean mask.
baseline_argmax = baseline_methods.baseline_argmax(psx, s)

# Method: CL: PBC
baseline_cl_pbc = cleanlab.pruning.get_noise_indices(
    s, psx, prune_method='prune_by_class')

# Method: CL: PBNR
baseline_cl_pbnr = cleanlab.pruning.get_noise_indices(
    s, psx, prune_method='prune_by_noise_rate')

# Method: CL: C+NR
baseline_cl_both = cleanlab.pruning.get_noise_indices(
    s, psx, prune_method='both')

# Invert every error mask so that True means "label considered clean".
clean_labels = {
    'conf_joint_only': ~baseline_conf_joint_only,
    'pruned_argmax': ~baseline_argmax,
    'cl_pbc': ~baseline_cl_pbc,
    'cl_pbnr': ~baseline_cl_pbnr,
    'cl_both': ~baseline_cl_both,
}