Data Directory

In [1]:
data_dir = "C:\\Users\\Small\\OneDrive\\Documents\\Cleanlab Testing\\chest_xray\\train"

ALED Functions

In [2]:
import sklearn
from sklearn.covariance import MinCovDet

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
import scipy.stats

import torch
import copy
import numpy as np
import pandas as pd
import os
from os import listdir
from torch.utils.data import DataLoader
import torch.nn as nn

# IMPORTANT FOR REPEATABILITY
SEED = 456
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

class ALED(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin):
    # BaseEstimator provides get and set_params() functions and ClassifierMixin provides weighted accuracy

    def __init__(self, random_state = 0): # random_state is default 0 for now, should be changed later!
        # Note: I think the sklearn API would say that some fit() params [e.g., model, max_pca_components, max_pca_variance]
        #   should be instantiation params and not fit() params

        self.random_state = random_state

    @staticmethod
    def sum_until(in_list, threshold):
        """
        Helper function for fit()
        """
        count = 0
        for n in range(len(in_list)):
            if count < threshold:
                count += in_list[n]
            else:
                break
        return count, n+1

    @staticmethod
    def gaussian_likelihood(cov_matrix, mahalanobis_squared_dist):
        """
        Helper function for fit()
        """
        n = cov_matrix.shape[0]
        sqrt_det = np.sqrt(np.linalg.det(cov_matrix))
        prob_x_f = ( 1 / ( (2*np.pi)**(n/2) * sqrt_det ) ) * np.exp(-0.5*mahalanobis_squared_dist)
        #if prob_x_f > 1: print("prob_x_f > 1; prob_x_f =", prob_x_f)
        return prob_x_f

    def extract_conv_net_features(self, conv_net, dataset):
        """
        Helper function for fit()
        """
        pass

    def map_conv_net_output(self):
        """
        Map the last layer of the convolutional layer (or other last feature extraction layer)
        so each sample ends up with 1 dimension of representative features
        """
        pass


    def extract_out_label(self, prediction_stats_row):
        given_label = prediction_stats_row['given label (name)']
        probabilities_row = np.array([prediction_stats_row["p(k = {} | x)".format(class_i)] for class_i in self.classes_])
        if (self.classes_[probabilities_row.argmax()] != given_label)\
           and (probabilities_row.max()/prediction_stats_row["p(k = {} | x)".format(given_label)] > self.likelihood_ratio_threshold):
            return self.classes_[probabilities_row.argmax()]
        else:
            return given_label

    def fit_predict(self, model, dataset, device=None, max_pca_components=10, max_pca_variance=0.25, likelihood_ratio_threshold=2, prob_method="gaussian", batch_size=100):
        # sklearn would want us to separate into fit() and predict() (NR)
        """
        Given a model and data, outputs predicted class for each sample based on ALED algorithm.
        Creates a new model that aggregates input classification model features / feature maps at last layer before classification into a [p x 1] vector.
        Then performs PCA to generate an [n x 1] vector of PCs, where n is specified either explicitly or by a desired level of explained variance. Then
        the probability that a given sample belongs to the assigned class is assessed using Bayes' rule, and if the probability falls below the within
        class threshold, then the sample is compared to the other class(es), and if the probability that it belongs in another class is higher than the
        out of class threshold, then the sample is added to an output DataFrame containing suspect samples and their associated probabilities. This
        function is dependent on the ability of the input classification model to extract salient features from the input data and thus should only be
        applied if the model is achieving some threshold auc / accuracy.

        ### Parameters
        1. model : Pytorch model (torch.nn.Sequential)
            -A classification model
        2. dataset : Pytorch Dataset (torch.utils.data.Dataset, eventually...right now needs an ImageFolder)
            -The dataset used to train model
             note: the dataset is responsible for holding any transformation functions;
             the transformations need to be the same transformations done for model evaluation; data augmentation should NOT be used!!!
        x. device : str, or None
            -Specifies device (e.g. 'cuda' GPU) to be used for processing
        4. max_pca_components : int
            -Maximum number of components used for PCA (10 by default);
             determines
        5. pca_variance : float
            -Ratio of explained variance desired, if
             using explained variance method (0.25 by
             default)
        6. wc_threshold : float
            -Within class threshold to consider sample
             within distribution of other class(es) (0.05
             by default)
        7. ooc_threshold : float
            -Out of class threshold to see whether flagged
             samples belong to other class(es) (0.95 by
             default)
        8. prob_method : str
            -Method to use for calculating the likelihood,
             either specify a distribution type or use a
             non-parametric estimator ("gaussian" is
             default)
        9. data_transforms : func
            -Any data transformations performed prior to use
             by the model # specify torchvision transforms (NR)

        ### Returns
        1. label_issues_df : Pandas DataFrame (pd.DataFrame)
            -A DataFrame containing the samples that are
             suspected label errors and their associated
             probabilities of belonging to each class,
             as calculated by the above methods

        ### Raises
        ______
        We'll get to that

        ### object attributes created:
        - self.conv_net
        - self.X_
        - self.y_
        - self.classes_
        - self.pca
        - self.num_pca_components
        - self.pca_explained_variance
        - self.X_pca_tf
        - self.c
        - self.prediction_stats

        """
        # Define Adaptive Labeling Error Detection (ALED) feature extraction model
        # I changed "ALED_model" to "self.conv_net"; please don't kill me; I think CNN, conv_net, or FEM are better names (NR)
        
        self.conv_net = nn.Sequential(*list(copy.deepcopy(model).eval().children())[:-1])
        # self.conv_net
        # I think this^ won't work for all models; maybe we do a deep recursive dive into model.modules() or
        #    model.children() to identify the last layer of type conv (NR)

        self.conv_net.avgpool = torch.nn.AdaptiveAvgPool2d((1,1)) # Feature pooling of feature maps, probably will need to add another method for 1d and 3d features
        print(self.conv_net)
        # Create sorted representation of data (right now only works on images in chest x-ray dataset):

        BATCH_SIZE = batch_size # maybe a param?
        dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

        feat_array = None # will be initialized once we know size of
        y = np.zeros(len(dataset))
        with torch.no_grad():
            for batch_num, (inputs, labels) in enumerate(dataloader):
                # note: the dataset is responsible for holding any transformation functions;
                # the transformations need to be the same transformations done for model evaluation; data augmentation should NOT be used!!!

                batch_cuda = inputs.to(device)

                batch_feat = self.conv_net(batch_cuda)
                batch_feat_cpu = batch_feat.to('cpu').squeeze() # this might need to be changed - can we assume user wants to use 'cpu'?
                # return batch_feat_cpu
                if feat_array is None:
                    feat_array = np.zeros((len(dataset), *batch_feat_cpu.shape[1:]))

                feat_array[batch_num*BATCH_SIZE : batch_num*BATCH_SIZE + len(batch_feat_cpu)] = batch_feat_cpu
                y[         batch_num*BATCH_SIZE : batch_num*BATCH_SIZE + len(batch_feat_cpu)] = labels

                del batch_cuda
                del batch_feat

        # feat_array.squeeze()

        print("feat_array.shape, y.shape:", feat_array.shape, y.shape)
        # Check that X and y have correct shape
        feat_array, y = check_X_y(feat_array, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)

        self.X_ = feat_array
        self.y_ = y
#         self.sample_weights = sample_weight # might need sample_weight param for compatibility with sklearn - document if ignored

        # perform PCA
        from sklearn.decomposition import PCA
        self.pca = PCA(n_components = max_pca_components) # don't rename this - we overwrite this variable 3 lines later
        self.pca.fit(feat_array)
        calc_var, max_variance_num_comps = self.sum_until(self.pca.explained_variance_ratio_, max_pca_variance)
        self.pca = PCA(n_components = min(max_pca_components, max_variance_num_comps)).fit(feat_array)
        self.X_pca_tf = self.pca.transform(feat_array) # pca_tf means pca transformed

        # instead of PCA, consider sklearn.random_projection.GaussianRandomProjection - maybe more robust to noise (NR)
        # also, random_projection.johnson_lindenstrauss_min_dim(...) finds a 'safe' number of components to randomly project to.


        # optional (could be helpful for user):
        self.num_pca_components = self.pca.n_components_
        self.pca_explained_variance = calc_var
        print("number of pca components:", self.num_pca_components)
        print("PCA explained variance:", self.pca_explained_variance)

        # Calculate covariance matrices for all class distributions
        self.num_classes = len(self.classes_)
        self.class_indices_dict = {}
        self.priors_dict = {}
        self.PC_dict = {}
        self.cov_dict = {}
        self.mvn_distributions = {}

        for class_i in self.classes_:
            class_indices = np.array(np.arange(len(feat_array))[y == class_i])
            self.class_indices_dict["class{}_indices".format(class_i)] = class_indices
            class_PC_array = self.X_pca_tf[class_indices, :]
            self.PC_dict["class{}_PC_array".format(class_i)] = class_PC_array
            self.cov_dict["robust_cov{}".format(class_i)] = MinCovDet(random_state=self.random_state).fit(class_PC_array)
            # self.mvn_distributions["mvn_{}".format(class_i)] = scipy.stats.multivariate_normal(mean=class_PC_array.mean(axis=0), cov=np.cov(class_PC_array, rowvar=False), seed=self.random_state) # this line from chatgpt, so could be good to double check the cov matrix is right (NR)

        # Calculate prior probabilities
        for class_i in self.classes_:
            prior = len(self.class_indices_dict["class{}_indices".format(class_i)]) / len(y)
            self.priors_dict["prior_prob{}".format(class_i)] = prior

        # Calculate likelihoods
        likelihoods_df_dict = {}
        for class_i in self.classes_:
            likelihoods_dict = {}
            for class_j in self.classes_:
                cov_matrix = self.cov_dict["robust_cov{}".format(class_j)].covariance_ #raw_covariance_ # why do we use raw_covariance_ instead of covariance_? (NR)
                cov_mahalanobis = self.cov_dict["robust_cov{}".format(class_j)].mahalanobis(self.PC_dict["class{}_PC_array".format(class_i)]) # I think this was redundant; see MinCovDet.dist_ (NR)
                likelihoods_dict["p(x | k = {})".format(class_j)] = np.array([self.gaussian_likelihood(cov_matrix, sample) for sample in cov_mahalanobis])

                # mahalanobis_distances = self.cov_dict["robust_cov{}".format(class_j)].dist_
                # likelihoods_dict["p(x | k = {})".format(class_j)] = np.array([self.gaussian_likelihood(cov_matrix, sample**2) for sample in mahalanobis_distances])

                # likelihoods = self.mvn_distributions["mvn_{}".format(class_j)].pdf(self.PC_dict["class{}_PC_array".format(class_i)])
                # class_j_pca_tf = self.PC_dict["class{}_PC_array".format(class_j)]
                # likelihoods = scipy.stats.multivariate_normal.pdf(self.PC_dict["class{}_PC_array".format(class_i)], mean=class_j_pca_tf.mean(axis=0), cov=np.cov(class_j_pca_tf, rowvar=False))
                # likelihoods_dict["p(x | k = {})".format(class_j)] = likelihoods

            likelihoods_df = pd.DataFrame.from_dict(likelihoods_dict).set_index(self.class_indices_dict["class{}_indices".format(class_i)])
            likelihoods_df_dict["likelihood_df{}".format(class_i)] = likelihoods_df

        # Calculate probabilities
        final_dfs_list = []
        for class_i in self.classes_:
            likelihoods_df = likelihoods_df_dict["likelihood_df{}".format(class_i)]
            for class_j in self.classes_:
                likelihoods_df["p(k = {} | x)".format(class_j)] = likelihoods_df["p(x | k = {})".format(class_j)] * self.priors_dict["prior_prob{}".format(class_j)] / \
                    sum( [likelihoods_df["p(x | k = {})".format(class_k)] * self.priors_dict["prior_prob{}".format(class_k)] for class_k in self.classes_] )
#             likelihoods_df["given label (num)"] = n
            likelihoods_df["ooc prob"] = likelihoods_df.iloc[:,len(self.classes_):].drop("p(k = {} | x)".format(class_i), axis=1).sum(axis=1)
            likelihoods_df["given label (name)"] = class_i
            # likelihoods_df["updated label (name)"] =
            final_dfs_list.append(likelihoods_df)

        self.prediction_stats = pd.concat(final_dfs_list).sort_index()
        self.likelihood_ratio_threshold = likelihood_ratio_threshold
        self.prediction_stats['Aled label'] = self.prediction_stats.apply(self.extract_out_label, axis=1)

        return self.prediction_stats['Aled label']

GPU-Conserving Attempt

In [3]:
def sum_until(in_list, threshold):
    """
    Helper function for find_label_issues
    """
    count = 0
    for n in range(len(in_list)):
        if count < threshold:
            count += in_list[n]
        else:
            break
    return count, n+1

def gaussian_likelihood(cov_matrix, mahalanobis_square_dist):
    """
    Helper function for find_label_issues
    """
    n = cov_matrix.shape[0]
    sqrt_det = np.sqrt(np.linalg.det(cov_matrix))
    prob_x_f = ( 1 / ( (2*np.pi)**(n/2) * sqrt_det ) ) * np.exp(-mahalanobis_square_dist)
    return prob_x_f

def find_label_issues(model, dataset, device=None, max_pca_components=10, max_pca_variance=0.25, likelihood_ratio_threshold=2, prob_method="gaussian", batch_size=4):
    """
    Creates a new model that aggregates input classification model features / feature maps at last layer before classification into a [p x 1] vector. 
    Then performs PCA to generate an [n x 1] vector of PCs, where n is specified either explicitly or by a desired level of explained variance. Then
    the probability that a given sample belongs to the assigned class is assessed using Bayes' rule, and if the probability falls below the within 
    class threshold, then the sample is compared to the other class(es), and if the probability that it belongs in another class is higher than the 
    out of class threshold, then the sample is added to an output DataFrame containing suspect samples and their associated probabilities. This
    function is dependent on the ability of the input classification model to extract salient features from the input data and thus should only be
    applied if the model is achieving some threshold auc / accuracy. 

    ### Parameters
    1. model : Pytorch model (torch.nn.Sequential)
        -A classification model
    2. data : Pytorch Dataset (torch.utils.data.Dataset, eventually...right now needs an ImageFolder)
        -The dataset used to train model
    3. pca_method : str
        -Use this string to specify whether to
         use the first n principal components 
         or to use as many components as needed
         to achieve a target explained variance
         ("variance" for variance method (default)
         or "components" for components method)
    4. pca_components : int
        -Number of components if using component
         method (None by default)
    5. pca_variance : float
        -Ratio of explained variance desired, if
         using explained variance method (0.25 by
         default)
    6. wc_threshold : float
        -Within class threshold to consider sample
         within distribution of other class(es) (0.05
         by default)
    7. ooc_threshold : float
        -Out of class threshold to see whether flagged
         samples belong to other class(es) (0.95 by
         default)
    8. prob_method : str
        -Method to use for calculating the likelihood,
         either specify a distribution type or use a 
         non-parametric estimator ("gaussian" is 
         default)
    9. data_transforms : func
        -Any data transformations performed prior to use 
         by the model
    
    ### Returns
    1. label_issues_df : Pandas DataFrame (pd.DataFrame)
        -A DataFrame containing the samples that are
         suspected label errors and their associated
         probabilities of belonging to each class, 
         as calculated by the above methods
    
    Raises
    
    We'll get to that

    """
    # Define Adaptive Labeling Error Detection (ALED) feature extraction model
    ALED_model = nn.Sequential(*list(model.children())[:-1])
    ALED_model.avgpool = torch.nn.AdaptiveAvgPool2d((1,1)) # Feature pooling of feature maps, probably will need to add another method for 1d and 3d features

    BATCH_SIZE = batch_size # maybe a param?
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

    feat_array = None # will be initialized once we know size of
    y = np.zeros(len(dataset))

    for batch_num, (inputs, labels) in enumerate(dataloader):
        # note: the dataset is responsible for holding any transformation functions;
        # the transformations need to be the same transformations done for model evaluation; data augmentation should NOT be used!!!

        batch_cuda = inputs.to(device)

        batch_feat = ALED_model(batch_cuda)
        batch_feat_cpu = batch_feat.to('cpu').squeeze() # this might need to be changed - can we assume user wants to use 'cpu'?
        # return batch_feat_cpu
        if feat_array is None:
            feat_array = np.zeros((len(dataset), *batch_feat_cpu.shape[1:]))

        feat_array[batch_num*BATCH_SIZE : batch_num*BATCH_SIZE + len(batch_feat_cpu)] = batch_feat_cpu
        y[         batch_num*BATCH_SIZE : batch_num*BATCH_SIZE + len(batch_feat_cpu)] = labels

        del batch_cuda
        del batch_feat

    # Store the classes seen during fit
    classes_ = unique_labels(y)

    X_ = feat_array
    y_ = y
#         self.sample_weights = sample_weight # might need sample_weight param for compatibility with sklearn - document if ignored

    # perform PCA
    from sklearn.decomposition import PCA
    pca = PCA(n_components = max_pca_components) # don't rename this - we overwrite this variable 3 lines later
    pca.fit(feat_array)
    calc_var, max_variance_num_comps = sum_until(pca.explained_variance_ratio_, max_pca_variance)
    pca = PCA(n_components = min(max_pca_components, max_variance_num_comps)).fit(feat_array)
    X_pca_tf = pca.transform(feat_array) # pca_tf means pca transformed

    # instead of PCA, consider sklearn.random_projection.GaussianRandomProjection - maybe more robust to noise (NR)
    # also, random_projection.johnson_lindenstrauss_min_dim(...) finds a 'safe' number of components to randomly project to.


    # optional (could be helpful for user):
    num_pca_components = pca.n_components_
    pca_explained_variance = calc_var
    print("number of pca components:", num_pca_components)
    print("PCA explained variance:", pca_explained_variance)

    # Calculate covariance matrices for all class distributions
    num_classes = len(classes_)
    class_indices_dict = {}
    priors_dict = {}
    PC_dict = {}
    cov_dict = {}
    mvn_distributions = {}

    for class_i in classes_:
        class_indices = np.array(np.arange(len(feat_array))[y == class_i])
        class_indices_dict["class{}_indices".format(class_i)] = class_indices
        class_PC_array = X_pca_tf[class_indices, :]
        PC_dict["class{}_PC_array".format(class_i)] = class_PC_array
        cov_dict["robust_cov{}".format(class_i)] = MinCovDet(random_state=0).fit(class_PC_array)
        # self.mvn_distributions["mvn_{}".format(class_i)] = scipy.stats.multivariate_normal(mean=class_PC_array.mean(axis=0), cov=np.cov(class_PC_array, rowvar=False), seed=self.random_state) # this line from chatgpt, so could be good to double check the cov matrix is right (NR)

    # Calculate prior probabilities
    for class_i in classes_:
        prior = len(class_indices_dict["class{}_indices".format(class_i)]) / len(y)
        priors_dict["prior_prob{}".format(class_i)] = prior

    # Calculate likelihoods
    likelihoods_df_dict = {}
    for class_i in classes_:
        likelihoods_dict = {}
        for class_j in classes_:
            cov_matrix = cov_dict["robust_cov{}".format(class_j)].covariance_ #raw_covariance_ # why do we use raw_covariance_ instead of covariance_? (NR)
            cov_mahalanobis = cov_dict["robust_cov{}".format(class_j)].mahalanobis(PC_dict["class{}_PC_array".format(class_i)]) # I think this was redundant; see MinCovDet.dist_ (NR)
            likelihoods_dict["p(x | k = {})".format(class_j)] = np.array([gaussian_likelihood(cov_matrix, sample) for sample in cov_mahalanobis])

        likelihoods_df = pd.DataFrame.from_dict(likelihoods_dict).set_index(class_indices_dict["class{}_indices".format(class_i)])
        likelihoods_df_dict["likelihood_df{}".format(class_i)] = likelihoods_df

    # Calculate probabilities
    final_dfs_list = []
    for class_i in classes_:
        likelihoods_df = likelihoods_df_dict["likelihood_df{}".format(class_i)]
        for class_j in classes_:
            likelihoods_df["p(k = {} | x)".format(class_j)] = likelihoods_df["p(x | k = {})".format(class_j)] * priors_dict["prior_prob{}".format(class_j)] / \
                sum( [likelihoods_df["p(x | k = {})".format(class_k)] * priors_dict["prior_prob{}".format(class_k)] for class_k in classes_] )
        likelihoods_df["ooc prob"] = likelihoods_df.iloc[:,len(classes_):].drop("p(k = {} | x)".format(class_i), axis=1).sum(axis=1)
        likelihoods_df["given label (name)"] = class_i
        final_dfs_list.append(likelihoods_df)

    def extract_out_label(prediction_stats_row):
        given_label = prediction_stats_row['given label (name)']
        probabilities_row = np.array([prediction_stats_row["p(k = {} | x)".format(class_i)] for class_i in classes_])
        if (classes_[probabilities_row.argmax()] != given_label)\
            and (probabilities_row.max()/prediction_stats_row["p(k = {} | x)".format(given_label)] > likelihood_ratio_threshold):
            return classes_[probabilities_row.argmax()]
        else:
            return given_label
        
    prediction_stats = pd.concat(final_dfs_list).sort_index()
    likelihood_ratio_threshold = likelihood_ratio_threshold
    prediction_stats['Aled label'] = prediction_stats.apply(extract_out_label, axis=1)

    return prediction_stats

Model-Related Functions

In [4]:
from torchvision import datasets
from torchvision.transforms import ToTensor

def generate(model, dataset, BATCH_SIZE):

    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Generate outputs from random model
    with torch.no_grad():
        for i, (images, labels) in enumerate(dataloader):
            images_cuda = images.to(device)

            output = model(images_cuda)

            del images_cuda

            yield output

def create_dataset_labels(outputs_generator):
    outputs_list = []
    for output in outputs_generator:
        output_cpu = output.cpu()
        outputs_list.append(output_cpu)
        del output
    outputs_tensor = torch.cat(outputs_list).squeeze()
    return outputs_tensor

def softmaxed_outputs_array(model, dataset, BATCH_SIZE):
    model_outputs = create_dataset_labels(generate(model, dataset, BATCH_SIZE))
    m = nn.Softmax(dim=1)
    softmaxed_array = m(model_outputs).numpy()
    return softmaxed_array

def CL_num_estimate(model, dataset, BATCH_SIZE):
  outs = softmaxed_outputs_array(model, dataset, BATCH_SIZE)
  thresholds_dict = {}
  for n in range(len(dataset.classes)):
      idx_n = np.where(np.array(dataset.labels) == n)[0]
      thresh_n = np.mean(outs[idx_n, n])
      thresholds_dict[n] = thresh_n

  predictions = np.argmax(outs, axis=1)
  df = pd.DataFrame(outs)
  df["label"] = np.array(dataset.labels)
  df["pred"] = predictions

  misclass_df = df[df["pred"] != df["label"]]
  misclass_df["CL_pred"] = misclass_df.apply(lambda row : row[row["pred"]] > thresholds_dict[row["pred"]], axis=1)

  num_estimate = sum(misclass_df["CL_pred"])

  return misclass_df, num_estimate

Model Training Functions

In [5]:
from torch.autograd import Variable

def train(num_epochs, cnn, loaders, optimizer, loss_func):

    cnn.train()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Train the model
    total_step = len(loaders['train'])

    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(loaders['train']):

            # gives batch data, normalize x when iterate train_loader
            b_x = Variable(images).to(device)   # batch x
            b_y = Variable(labels).to(device)   # batch y
            output = cnn(b_x)
            loss = loss_func(output, b_y)

            # clear gradients for this training step
            optimizer.zero_grad()

            # backpropagation, compute gradients
            loss.backward()
            # apply gradients
            optimizer.step()

            if (i+1) % 100 == 0:
                print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                       .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))

Dataset Mislabeling

In [6]:
from sklearn.utils.multiclass import unique_labels
from torch.utils.data import Dataset
import copy
from torch.utils.data import Subset

class Mislabeling_Dataset(Dataset):
    """
    a Dataset class that contains a Dataset, providing the data of the internal Dataset
    but with labeling errors
    """
    def __init__(self, internal_dataset, internal_dataset_labels = None, fraction_mislabeled = 0.1, random_state = None):

        super().__init__()
        self.internal_dataset = copy.deepcopy(internal_dataset)
        self.true_labels = copy.deepcopy(internal_dataset_labels)
        if self.true_labels is None:
            self.set_true_labels()
        elif len(self.true_labels) != len(internal_dataset):
          raise Exception("Must have same number of labels as the length of the dataset [i.e. len(internal_dataset_labels) == len(internal_dataset)]")
        self.classes = unique_labels(self.true_labels)
        self.fraction_mislabeled = fraction_mislabeled
        self.rng = np.random.default_rng(random_state)
        # could potentially add a parameter to choose what method to mislabel (e.g., completely random, half mislabeled from each class)


        self.mislabel_sample_inds = self.rng.choice(np.arange(len(self.internal_dataset)), size=int(self.fraction_mislabeled*len(internal_dataset)), replace=False)
        mislabel_maps = [[class_j for class_j in self.classes if class_j != class_i] for class_i in self.classes]

        self.labels = copy.deepcopy(self.true_labels)
        for ind in self.mislabel_sample_inds:
            self.labels[ind] = mislabel_maps[self.true_labels[ind]][self.rng.integers(len(self.classes)-1)]


    def set_true_labels(self):
        # helper function used by __init__()
        self.true_labels = np.zeros(len(self.internal_dataset), dtype=object)
        for i in range(len(self.internal_dataset)):
            self.true_labels[i] = self.internal_dataset[i][1]

    def __getitem__(self, idx):
        try:
            return self.internal_dataset[idx][0], self.labels[idx].item()
        except:
            return self.internal_dataset[idx][0], self.labels[idx]

    def __len__(self):
        return len(self.internal_dataset)

class custom_Subset(Subset):
  def __init__(self, dataset, indices):
    self.classes = dataset.classes
    self.indices = indices
    self.dataset = dataset
    self.labels = dataset.labels[indices]
    self.true_labels = dataset.true_labels[indices]

ALED Looping Function

In [7]:
import sys, os
import warnings
from torch import optim

class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout

def ALED_Looper(cnn, mislabeled_data, num_epochs, num_ALED, batch_size, initial_train=True, num_est=None):
    
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.Adam(cnn.parameters(), lr = 0.01)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    if initial_train:
        loaders = {
            'train' : torch.utils.data.DataLoader(mislabeled_data,
                                                batch_size=batch_size,
                                                shuffle=False,
                                                num_workers=0)
        }

        train(num_epochs, cnn, loaders, optimizer, loss_func)

        warnings.filterwarnings("ignore")

        m_df, num_est = CL_num_estimate(cnn, mislabeled_data, batch_size)

        warnings.filterwarnings("always")

    for n in range(num_ALED):
        print("ALED Starting")

        warnings.filterwarnings("ignore")
        
        with HiddenPrints():
            aled = ALED()
            prob_df = aled.fit_predict(model=cnn, dataset=mislabeled_data, device=device, batch_size=1000, max_pca_variance=0.99, max_pca_components=8)

        warnings.filterwarnings("always")

        posterior_probs_df = aled.prediction_stats.iloc[:,10:]
        indices_to_remove = posterior_probs_df.sort_values(by=["ooc prob"], axis=0, ascending=False).head(num_est).index.to_numpy()
        indices_to_keep = [i for i in posterior_probs_df.index.to_numpy() if i not in indices_to_remove]

        filtered_training_data = custom_Subset(mislabeled_data, indices_to_keep)
        removed_training_data = custom_Subset(mislabeled_data, indices_to_remove)

        posterior_probs_df["true labels"] = mislabeled_data.true_labels
        checking_df = posterior_probs_df.sort_values(by=["ooc prob"], axis=0, ascending=False).head(num_est)
        num_correct = len(checking_df[checking_df["true labels"] != checking_df["given label (name)"]])
        print("Num Est:", num_est)
        print("Num Correct:", num_correct)
        print("Percent Correct: ", round(100*num_correct/num_est, 2))

        print("ALED Done")
        loaders_ALED = {
            'train' : torch.utils.data.DataLoader(filtered_training_data,
                                                batch_size=batch_size,
                                                shuffle=False,
                                                num_workers=0)
        }

        train(num_epochs, cnn, loaders_ALED, optimizer, loss_func)

Cleanlab Analysis Function

In [8]:
def CL_Looper(cnn, mislabeled_data, num_epochs, batch_size):
    
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.Adam(cnn.parameters(), lr = 0.01)

    loaders = {
        'train' : torch.utils.data.DataLoader(mislabeled_data,
                                            batch_size=batch_size,
                                            shuffle=False,
                                            num_workers=0)
    }

    train(num_epochs, cnn, loaders, optimizer, loss_func)

    print("Starting Cleanlab")

    warnings.filterwarnings("ignore")

    m_df, num_est = CL_num_estimate(cnn, mislabeled_data, batch_size)

    warnings.filterwarnings("always")

    CL_array = m_df[m_df["CL_pred"]==True].index.to_numpy()
    truth_array = np.equal(np.array(mislabeled_data.true_labels), np.array(mislabeled_data.labels))
    truth_array = np.where(truth_array==False)
    num_correct = sum(np.in1d(CL_array, truth_array))
    print("Num Est:", num_est)
    print("Num Correct:", num_correct)
    print("Percent Correct: ", round(100*num_correct/num_est, 2))

    return num_est

Dataset Setup

In [9]:
import torchvision.transforms as T

data_T = T.Compose([
                
                T.Resize(size = (224,224)),
                T.ToTensor(),
                T.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225])
        ])

trainset = datasets.ImageFolder(data_dir, transform = data_T)

Model

In [10]:
import torchvision

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torchvision.models.densenet161(weights="DenseNet161_Weights.DEFAULT")

for param in model.parameters():
    param.requires_grad = False

in_features = model.classifier.in_features
model.classifier = nn.Linear(in_features, len(trainset.classes))
model = model.to(device)

In [11]:
mislabeled_train_data = Mislabeling_Dataset(trainset, internal_dataset_labels=trainset.targets, fraction_mislabeled=0.1, random_state=0)

Cleanlab Estimation

In [12]:
num_epochs = 2
batch_size = 4

num_est = CL_Looper(cnn=model, mislabeled_data=mislabeled_train_data, num_epochs=num_epochs, batch_size=batch_size)

Epoch [1/2], Step [100/1304], Loss: 0.0149
Epoch [1/2], Step [200/1304], Loss: 0.0000
Epoch [1/2], Step [300/1304], Loss: 0.0268
Epoch [1/2], Step [400/1304], Loss: 0.0097
Epoch [1/2], Step [500/1304], Loss: 0.6661
Epoch [1/2], Step [600/1304], Loss: 0.0013
Epoch [1/2], Step [700/1304], Loss: 1.4871
Epoch [1/2], Step [800/1304], Loss: 0.0035
Epoch [1/2], Step [900/1304], Loss: 2.8204
Epoch [1/2], Step [1000/1304], Loss: 0.6670
Epoch [1/2], Step [1100/1304], Loss: 0.0000
Epoch [1/2], Step [1200/1304], Loss: 0.0001
Epoch [1/2], Step [1300/1304], Loss: 3.2462
Epoch [2/2], Step [100/1304], Loss: 0.0105
Epoch [2/2], Step [200/1304], Loss: 0.0000
Epoch [2/2], Step [300/1304], Loss: 0.0236
Epoch [2/2], Step [400/1304], Loss: 0.0001
Epoch [2/2], Step [500/1304], Loss: 2.0114
Epoch [2/2], Step [600/1304], Loss: 0.0009
Epoch [2/2], Step [700/1304], Loss: 1.3286
Epoch [2/2], Step [800/1304], Loss: 0.0005
Epoch [2/2], Step [900/1304], Loss: 2.1368
Epoch [2/2], Step [1000/1304], Loss: 2.0623
Epoch 

ALED Estimation

In [13]:
num_ALED = 1

prob_df = find_label_issues(model, mislabeled_train_data, device=device, batch_size=batch_size)
prob_df["true labels"] = mislabeled_train_data.true_labels
checking_df = prob_df.sort_values(by=["ooc prob"], axis=0, ascending=False).head(num_est)
num_correct = len(checking_df[checking_df["true labels"] != checking_df["given label (name)"]])
print("Num Est:", num_est)
print("Num Correct:", num_correct)
print("Percent Correct:", round(100*num_correct/num_est, 2))

#output = ALED_Looper(cnn=model, mislabeled_data=mislabeled_train_data, num_epochs=num_epochs, num_ALED=num_ALED, batch_size=batch_size, initial_train=False, num_est=num_est)

number of pca components: 10
PCA explained variance: 0.24977704547712423
Num Est: 1623
Num Correct: 496
Percent Correct:  30.56
