In [10]:
import numpy as np
import pandas as pd
import sys

from tensorflow.python.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Dropout, Flatten
from tensorflow.keras.models import Sequential

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, label_binarize
from sklearn.pipeline import Pipeline

from numba import njit, prange

In [2]:
@njit(fastmath=True, parallel=True)
def _compute_shapley_values(X_train, y_train, X_test, y_test, K=1):
    N = len(X_train)
    M = len(X_test)
    result = np.zeros(N, dtype=np.float32)

    for j in prange(M):
        score = np.zeros(N, dtype=np.float32)
        dist = np.zeros(N, dtype=np.float32)
        div_range = np.arange(1.0, N)
        div_min = np.minimum(div_range, K)
        for i in range(N):
            dist[i] = np.sqrt(np.sum(np.square(X_train[i] - X_test[j])))
        indices = np.argsort(dist)
        y_sorted = y_train[indices]
        eq_check = (y_sorted == y_test[j]) * 1.0
        diff = - 1 / K * (eq_check[1:] - eq_check[:-1])
        diff /= div_range
        diff *= div_min
        score[indices[:-1]] = diff
        score[indices[-1]] = eq_check[-1] / N
        score[indices] += np.sum(score[indices]) - np.cumsum(score[indices])
        result += score / M

    return result

In [3]:
def decode_image(img_str):
    return np.array([int(val) for val in img_str.split(':')])

def normalise_image(images):
    return images / 255.0


def reshape_images(images):
    return np.concatenate(images['image'].values) \
        .reshape(images.shape[0], 28, 28, 1)


def create_cnn():
    model = Sequential([
        Conv2D(filters=64, kernel_size=2, padding='same', activation='relu', input_shape=(28, 28, 1)),
        MaxPooling2D(pool_size=2),
        Dropout(0.3),
        Conv2D(filters=32, kernel_size=2, padding='same', activation='relu'),
        MaxPooling2D(pool_size=2),
        Dropout(0.3),
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(2, activation='softmax')
    ])

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [4]:
train_data = pd.read_csv(f'arguseyes/example_pipelines/datasets/sneakers/product_images.csv', converters={'image': decode_image})

product_categories = pd.read_csv('arguseyes/example_pipelines/datasets/sneakers/product_categories.csv')
with_categories = train_data.merge(product_categories, on='category_id')

categories_to_distinguish = ['Sneaker', 'Ankle boot']

images_of_interest = with_categories[with_categories['category_name'].isin(categories_to_distinguish)]


In [5]:
pipeline = Pipeline(steps=[
    ('normalisation', FunctionTransformer(normalise_image)),
    ('reshaping', FunctionTransformer(reshape_images)),
    ('model', KerasClassifier(create_cnn, epochs=10, verbose=0))
])

random_seed_for_splitting = 1337

train, test = train_test_split(images_of_interest, test_size=0.2, random_state=random_seed_for_splitting)

y_train = label_binarize(train['category_name'], classes=categories_to_distinguish)
y_test = label_binarize(test['category_name'], classes=categories_to_distinguish)



In [6]:
train['id'] = np.array(range(len(train)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['id'] = np.array(range(len(train)))


In [7]:
def run_exp(seed):
    np.random.seed(seed)
    y_train_dirty = np.copy(y_train)
    mislabeled_idx = np.random.choice(range(len(y_train_dirty)), replace=False, size=475)
    y_train_dirty[mislabeled_idx] = np.logical_not(y_train_dirty[mislabeled_idx])
    mislabeled_identifiers = set(train.iloc[mislabeled_idx]['id'])

    train_copy = train.copy(deep=True)
    test_copy = test.copy(deep=True)
    y_train_dirty_copy = np.copy(y_train_dirty)

    model = pipeline.fit(train_copy[['image']], y_train_dirty_copy)
    print('Initial acc: ', model.score(test_copy[['image']], y_test))

    for round in range(10):
        print('ROUND', round)

        X_train = model.steps[1][1].transform(model.steps[0][1].transform(train_copy[['image']]))
        X_test = model.steps[1][1].transform(model.steps[0][1].transform(test_copy[['image']]))

        s = 100 
        k = 10
        step_size = 50

        X_test_sampled = X_test[:s, :]
        y_test_sampled = y_test[:s, :]

        shapley_values = _compute_shapley_values(X_train, 
                                                 np.squeeze(y_train_dirty_copy),         
                                                 X_test_sampled,
                                                 np.squeeze(y_test_sampled), 
                                                 k)

        redundant_idx = np.argsort(shapley_values)[:step_size]
        chosen_identifiers = set(train_copy.iloc[redundant_idx]['id'])

        idx_to_keep = np.array([pos for pos in range(len(y_train_dirty_copy)) if pos not in redundant_idx])
        train_copy = train_copy.iloc[idx_to_keep]
        y_train_dirty_copy = y_train_dirty_copy[idx_to_keep]

        print('# Correctly identified', len(chosen_identifiers & mislabeled_identifiers))
        print('# samples', len(train_copy))

        model = pipeline.fit(train_copy[['image']], y_train_dirty_copy)

        print('Acc: ', model.score(test_copy[['image']], y_test))

    

In [30]:
!pip install cleanlab

Collecting cleanlab
  Downloading cleanlab-2.0.0-py2.py3-none-any.whl (95 kB)
[K     |████████████████████████████████| 95 kB 2.3 MB/s eta 0:00:011
Collecting tqdm>=4.53.0
  Downloading tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 10.4 MB/s eta 0:00:01
Installing collected packages: tqdm, cleanlab
Successfully installed cleanlab-2.0.0 tqdm-4.64.0
You should consider upgrading via the '/Users/ssc/projects/arguseyes/venv/bin/python3.9 -m pip install --upgrade pip' command.[0m


In [51]:
def run_exp_noshapley(seed):
    np.random.seed(seed)
    y_train_dirty = np.copy(y_train)
    mislabeled_idx = np.random.choice(range(len(y_train_dirty)), replace=False, size=475)
    y_train_dirty[mislabeled_idx] = np.logical_not(y_train_dirty[mislabeled_idx])
    mislabeled_identifiers = set(train.iloc[mislabeled_idx]['id'])

    train_copy = train.copy(deep=True)
    test_copy = test.copy(deep=True)
    y_train_dirty_copy = np.copy(y_train_dirty)

    model = pipeline.fit(train_copy[['image']], y_train_dirty_copy)
    print('Initial acc: ', model.score(test_copy[['image']], y_test))

    
    X_train = model.steps[1][1].transform(model.steps[0][1].transform(train_copy[['image']]))
    X_test = model.steps[1][1].transform(model.steps[0][1].transform(test_copy[['image']]))
    
    #issues = CleanLearning(LogisticRegression, seed=seed).find_label_issues(data, labels)
    
    step_size=50
    
    for round in range(10):
        print('ROUND', round)
    
        probs = model.predict_proba(train_copy[['image']])
        diffs = abs(probs[:,0] - probs[:,1])
        redundant_idx = np.argsort(diffs)[:step_size]
        
        redundant_idx = np.random.choice(range(len(train_copy)), replace=False, size=step_size)
        chosen_identifiers = set(train_copy.iloc[redundant_idx]['id'])

        idx_to_keep = np.array([pos for pos in range(len(y_train_dirty_copy)) if pos not in redundant_idx])
        train_copy = train_copy.iloc[idx_to_keep]
        y_train_dirty_copy = y_train_dirty_copy[idx_to_keep]

        print('# Correctly identified', len(chosen_identifiers & mislabeled_identifiers))
        print('# samples', len(train_copy))

        model = pipeline.fit(train_copy[['image']], y_train_dirty_copy)

        print('Acc: ', model.score(test_copy[['image']], y_test))
    


In [52]:
run_exp_noshapley(1)

Initial acc:  0.5252100825309753
ROUND 0
# Correctly identified 30
# samples 900
Acc:  0.5714285969734192
ROUND 1
# Correctly identified 27
# samples 850
Acc:  0.5630252361297607
ROUND 2
# Correctly identified 25
# samples 800
Acc:  0.5714285969734192
ROUND 3
# Correctly identified 29
# samples 750
Acc:  0.529411792755127
ROUND 4
# Correctly identified 25
# samples 700
Acc:  0.5042017102241516
ROUND 5
# Correctly identified 24
# samples 650
Acc:  0.7478991746902466
ROUND 6
# Correctly identified 29
# samples 600
Acc:  0.7058823704719543
ROUND 7
# Correctly identified 25
# samples 550
Acc:  0.5462185144424438
ROUND 8
# Correctly identified 20
# samples 500
Acc:  0.5042017102241516
ROUND 9
# Correctly identified 26
# samples 450
Acc:  0.7184873819351196
