## Packages

In [3]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import os

import collections
import random

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import load_img

import keras
from keras.models import Sequential,Input,Model
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.layers.experimental.preprocessing import Rescaling

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## Lectura de les dades

In [4]:
# Record the TensorFlow release this notebook was executed with.
print(tf.__version__)

2.6.0


In [5]:
# Display the working directory; data_dir is built from it in the next cell.
os.getcwd()

'C:\\Users\\aidaf\\TFG'

In [6]:
# Root folder of the image set, located inside the working directory.
# The folder name is passed as a separate argument so os.path.join inserts the
# platform's own separator instead of a hard-coded Windows backslash.
data_dir = os.path.join(os.getcwd(), "lung_colon_image_set")
data_dir

'C:\\Users\\aidaf\\TFG\\lung_colon_image_set'

In [7]:
# os.listdir returns entries in arbitrary order and also lists plain files, so
# keep only the sub-directories and sort them to obtain a stable class list.
# NOTE(review): assumes the integer labels stored in labels.npy follow this
# alphabetical folder order - confirm against the script that produced them.
classes = sorted(
    entry
    for entry in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, entry))
)
print(classes)

['colon_aca', 'colon_n', 'lung_aca', 'lung_n', 'lung_scc']


In [8]:
# Read the saved arrays from the working directory: images, then labels.
dades_X = np.load('images.npy')
dades_Y = np.load('labels.npy')

## Descripció de les dades

Dimensió: 

In [9]:
# Print the shape of the image array followed by the shape of the label array.
print('Dimensions de les dades : ', dades_X.shape, dades_Y.shape)

Dimensions de les dades :  (25000, 100, 100, 3) (25000,)


Nombre de classes i etiquetes: 

In [10]:
# The class count is the number of entries in the class list.
n_classes = len(classes)
for caption, value in (('Nombre de classes : ', n_classes),
                       ('Etiquetes de les classes : ', classes)):
    print(caption, value)

Nombre de classes :  5
Etiquetes de les classes :  ['colon_aca', 'colon_n', 'lung_aca', 'lung_n', 'lung_scc']


Distribució de les classes: 

In [11]:
# Count how many samples carry each label value.
ocurrences=collections.Counter(dades_Y)

# Print the per-label counts.
print("Distribució de les classes : ",ocurrences)

Distribució de les classes :  Counter({3: 5000, 1: 5000, 4: 5000, 2: 5000, 0: 5000})


## Pre-processing

One-hot:

In [12]:
# One-hot encode the integer labels by indexing an identity matrix.
# This fixes the width at n_classes (the size of the model's output layer),
# with column k standing for label k, whereas pd.get_dummies only creates
# columns for the label values that actually occur in the array.
dades_Y_one_hot = np.eye(n_classes, dtype='float32')[np.asarray(dades_Y, dtype=np.intp)]
dades_Y_one_hot.shape

(25000, 5)

In [13]:
# Display the shape of the image array.
dades_X.shape

(25000, 100, 100, 3)

Split validation-train:

In [14]:
# Hold out 10% of the samples for validation. Stratifying on the labels keeps
# the class proportions the same in both partitions; the fixed random_state
# makes the partition repeatable.
train_X_aux, valid_X, train_label, valid_label = train_test_split(
    dades_X,
    dades_Y_one_hot,
    test_size=0.1,
    random_state=13,
    stratify=dades_Y_one_hot,
)

In [15]:
# Shapes of the four arrays produced by the split, in the order returned.
tuple(part.shape for part in (train_X_aux, valid_X, train_label, valid_label))

((22500, 100, 100, 3), (2500, 100, 100, 3), (22500, 5), (2500, 5))

Mostrem que efectivament la mostra de validació està estratificada per classes:

In [16]:
# Convert the one-hot validation labels back to class ids and count them.
valid_class_ids = np.round(valid_label).argmax(axis=1)
ocurrences2 = collections.Counter(valid_class_ids)

print("Distribució de les classes : ", ocurrences2)

Distribució de les classes :  Counter({0: 500, 4: 500, 2: 500, 1: 500, 3: 500})


## Methods to Standardize Research with Noisy Labels

In [30]:
# Initialization of the label-noise experiment.

# Sparsity of the noise matrix (consider increasing to 0.5).
FRAC_ZERO_NOISE_RATES = 0.0
# Proxy for the fraction of correct labels: 0.65 means ~35% wrong labels.
# Increasing it makes the problem easier.
avg_trace = 0.65
# Amount of data for each dataset (try 250 or 400 to use less or more data).
dataset_size = 400
# Step size in the mesh.
h = 0.02

# Relative frequency of each label value: counts divided by the sample count.
py = np.bincount(dades_Y) / len(dades_Y)

In [43]:
from cleanlab.noise_generation import generate_noise_matrix_from_trace
from cleanlab.noise_generation import generate_noisy_labels
from cleanlab.util import print_noise_matrix

In [41]:
# Generate a noise matrix (guarantees learnability)
noise_matrix = generate_noise_matrix_from_trace(
    K = n_classes, 
    trace = n_classes * avg_trace,
    py = py,
    frac_zero_noise_rates = FRAC_ZERO_NOISE_RATES,
)

In [45]:
# Show the noise matrix that is about to be applied.
print_noise_matrix(noise_matrix)

# Seed NumPy's global generator before drawing the noisy labels
# (presumably so the corruption is repeatable - confirm in cleanlab).
np.random.seed(1)
# Create the noisy labels. This method is exact w.r.t. the noise_matrix.
# Note that every label in dades_Y is passed in, not only a training subset.
y_train_w_errors = generate_noisy_labels(dades_Y, noise_matrix)


 Noise Matrix (aka Noisy Channel) P(s|y) of shape (5, 5)
 p(s|y)	y=0	y=1	y=2	y=3	y=4
	---	---	---	---	---
s=0 |	0.6	0.02	0.1	0.13	0.03
s=1 |	0.12	0.95	0.09	0.08	0.05
s=2 |	0.03	0.01	0.7	0.02	0.35
s=3 |	0.06	0.02	0.0	0.59	0.16
s=4 |	0.19	0.01	0.12	0.19	0.42
	Trace(matrix) = 3.25



In [54]:
# One-hot encode the noisy labels by indexing an identity matrix.
# The width is fixed at n_classes (the size of the model's output layer), with
# column k standing for label k. pd.get_dummies would instead create columns
# only for the label values that survive the noise injection.
dades_Y_one_hot_noisy = np.eye(n_classes, dtype='float32')[np.asarray(y_train_w_errors, dtype=np.intp)]
dades_Y_one_hot_noisy.shape

(25000, 5)

In [63]:
# Split images, noisy labels and clean labels in ONE call so that all arrays
# share the same row partition. Stratifying on the clean labels with the same
# test_size and random_state as the clean-label split keeps the partition
# identical to it. Stratifying on the noisy labels instead would produce a
# different partition: valid_X would be overwritten with rows that no longer
# match valid_label, and clean validation images would leak into training.
(train_X, valid_X,
 train_label_noisy, valid_label_noisy,
 train_label, valid_label) = train_test_split(
    dades_X,
    dades_Y_one_hot_noisy,
    dades_Y_one_hot,
    test_size=0.1,
    random_state=13,
    stratify=dades_Y_one_hot,
)
# Report the shapes of the arrays used for the noisy-label experiment.
train_X.shape, valid_X.shape, train_label_noisy.shape, valid_label_noisy.shape

((22500, 100, 100, 3), (2500, 100, 100, 3), (22500, 5), (2500, 5))

In [64]:
# Small CNN: two convolution + pooling stages followed by a dense head.
# Every Conv2D/Dense layer is linear and is followed by an explicit LeakyReLU.
model = Sequential([
    # Scale pixel values by 1/255 inside the model.
    Rescaling(1./255, input_shape=(dades_X.shape[1], dades_X.shape[2], 3)),
    Conv2D(32, kernel_size=(3, 3), activation='linear', padding='same'),
    LeakyReLU(alpha=0.1),
    MaxPooling2D((2, 2), padding='same'),
    Conv2D(64, (3, 3), activation='linear', padding='same'),
    LeakyReLU(alpha=0.1),
    MaxPooling2D(pool_size=(2, 2), padding='same'),
    Flatten(),
    Dense(64, activation='linear'),
    LeakyReLU(alpha=0.1),
    Dropout(0.3),
    # One softmax output per class.
    Dense(n_classes, activation='softmax'),
])

In [65]:
# Categorical cross-entropy matches the one-hot labels and softmax output;
# Adam is used with its default settings and accuracy is tracked per epoch.
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)

In [66]:
# fit model on the augmented dataset
# initialize the number of epochs and batch size
EPOCHS = 10
BS = 256

# train the network
H = model.fit(train_X, train_label_noisy, batch_size=BS, epochs=EPOCHS,
              verbose=1, validation_data=(valid_X, valid_label))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
15/88 [====>.........................] - ETA: 4:15 - loss: 0.8733 - accuracy: 0.6685

KeyboardInterrupt: 

In [None]:
# Save and reload through a single path so that the evaluated model is the one
# that was just trained (saving to one file name and loading another would
# evaluate a stale file or fail with a missing-file error).
MODEL_PATH = "def_model.h5"
model.save(MODEL_PATH)

def_model = keras.models.load_model(MODEL_PATH)
# evaluate returns the loss followed by the compiled metrics (accuracy).
test_eval = def_model.evaluate(valid_X, valid_label, verbose=0)

print('Test loss:', test_eval[0])
print('Test accuracy:', test_eval[1])

In [None]:
# Pull the per-epoch curves out of the History object returned by model.fit.
history = H.history
accuracy, val_accuracy = history['accuracy'], history['val_accuracy']
loss, val_loss = history['loss'], history['val_loss']
epochs = range(len(accuracy))

# One figure per metric: training values as blue dots, validation as a line.
curves = (
    ('accuracy', accuracy, val_accuracy),
    ('loss', loss, val_loss),
)
for position, (metric, train_curve, valid_curve) in enumerate(curves):
    if position:
        # Open a fresh figure for every metric after the first one.
        plt.figure()
    plt.plot(epochs, train_curve, 'bo', label='Training ' + metric)
    plt.plot(epochs, valid_curve, 'c', label='Validation ' + metric)
    plt.title('Training and validation ' + metric)
    plt.legend()
plt.show()

In [None]:
# Class probabilities predicted for the validation images.
predicted_probs = def_model.predict(valid_X)

# Pick the most probable class directly. Rounding the probabilities before the
# argmax turns every row whose top probability is <= 0.5 into all zeros, and
# argmax of an all-zero row is always class 0.
predicted_classes = np.argmax(predicted_probs, axis=1)
valid_labels = np.argmax(valid_label, axis=1)

# Pass the label ids explicitly so the report always has one row per class
# name, even if a class is absent from both the truth and the predictions.
print(classification_report(valid_labels, predicted_classes,
                            labels=list(range(len(classes))),
                            target_names=classes))

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Draw a confusion matrix as an annotated heat map.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (rows are true classes, columns are predicted classes)

    target_names: the class names used as tick labels, for example
                  ['high', 'medium', 'low']; pass None to keep numeric ticks

    title:        the text to display at the top of the matrix

    cmap:         the colour map used for the cells, e.g.
                  plt.get_cmap('jet') or plt.cm.Blues; defaults to 'Blues'
                  see http://matplotlib.org/examples/color/colormaps_reference.html

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions of each true-class row;
                  rows without any sample are shown as zeros

    Returns
    -------
    None. The figure is shown with plt.show().

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    cm = np.asarray(cm)

    # Accuracy is computed from the raw counts, before any normalisation.
    total = cm.sum()
    accuracy = np.trace(cm) / float(total) if total else 0.0
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    # Normalise BEFORE drawing so that the cell colours and the printed values
    # (and the text-colour threshold) refer to the same numbers.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows with no samples stay at zero instead of producing NaN.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.figure(figsize=(16, 12))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text so it stays readable.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.2f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Adjust the layout once the axis labels exist so they are accounted for.
    plt.tight_layout()
    plt.show()

In [None]:
# Confusion matrix of true vs. predicted validation classes, drawn with the
# class names as tick labels.
validation_cm = confusion_matrix(valid_labels, predicted_classes)
plot_confusion_matrix(validation_cm, target_names=classes)