In [1]:
import tensorflow as tf
from model import *
from data_pipeline import *
from constants import *
import numpy as np
import matplotlib.pylab as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import os
import pandas as pd
import shutil
import glob
%matplotlib inline

In [38]:
# processing csv for easy processing
data = pd.read_csv('./Data/train/train-filtered.csv')
data.rename(columns = {'diabetic retinopathy': 'diabetic_retinopathy'}, inplace=True)
labels = dict(diabetic_retinopathy=0, glaucoma=1, normal=2 )
tmp = data[list(data.columns[1:])]
tmp['label_str'] = tmp.idxmax(axis=1)
data['label_str'] = tmp['label_str']
data['label'] = data['label_str'].apply(lambda x: labels[x])

In [3]:
model = tf.keras.models.load_model(MODEL_CHECKPOINT_DIR+'resnet50v2_retrain_8layers_lr_1e-3')

In [4]:
# grab the features learnt and create a new data set
training_data = []
for _, row in data.iterrows():
    file_path = os.path.join(DATA_DIR, row.filename)
    img = tf.keras.preprocessing.image.load_img(file_path, target_size=IMAGE_SIZE, interpolation='bilinear')
    img = tf.keras.preprocessing.image.img_to_array(img)
    encoded_features = model.layers[0].predict(np.array([img]))
    training_data.append((row.filename, encoded_features[0], row.label))

In [5]:
len(training_data)

1839

In [7]:
X_train = np.array([np.array(x[1]) for x in training_data])
y_train = np.array([np.array(x[2]) for x in training_data])
X_train.shape, y_train.shape, np.unique(y_train)

((1839, 2048), (1839,), array([0, 1, 2]))

In [9]:
# fit k-means on the encoding(features learnt)
from sklearn.cluster import MiniBatchKMeans
total_clusters = len(np.unique(y_train))
kmeans = MiniBatchKMeans(n_clusters = len(np.unique(y_train)))
kmeans.fit(X_train)

MiniBatchKMeans(n_clusters=3)

In [11]:
# prepage unlabelled data to get encodings
from pathlib import Path
unlabelled_data = []
unlabelled_data_dir = './Data/test/test/'
for filename in glob.glob(os.path.join(unlabelled_data_dir, '*.jpg')):
    img = tf.keras.preprocessing.image.load_img(filename, target_size=IMAGE_SIZE, interpolation='bilinear')
    img = tf.keras.preprocessing.image.img_to_array(img)
    encoded_features = model.layers[0].predict(np.array([img]))
    unlabelled_data.append((Path(filename).name, encoded_features[0]))

In [12]:
len(unlabelled_data)

350

In [13]:
X_test = np.array([np.array(x[1]) for x in unlabelled_data])
y_pred = kmeans.predict(X_test)

In [14]:
np.unique(y_pred)

array([0, 1, 2], dtype=int32)

In [39]:
# creating new csv with labels for previous unlabelled data
inv_labels = {v: k for k, v in labels.items()}
tmp = pd.DataFrame(data=None, columns=data.columns)
for x, y in zip(unlabelled_data, y_pred):
    label_name = inv_labels[y]
    tmp = tmp.append({'filename': x[0], 'label_str': inv_labels[y], 'label':int(y), label_name:1}, ignore_index=True)
tmp = tmp.fillna(int(0))

In [40]:
tmp.head()

Unnamed: 0,filename,diabetic_retinopathy,glaucoma,normal,label_str,label
0,43fb472ae7e8.jpg,0,1,0,glaucoma,1
1,2f091a49f704.jpg,0,1,0,glaucoma,1
2,352052af1e29.jpg,1,0,0,diabetic_retinopathy,0
3,3babd2098a14.jpg,1,0,0,diabetic_retinopathy,0
4,2eecb9318d63.jpg,1,0,0,diabetic_retinopathy,0


In [41]:
#copy unlabelled data to train
src_file_path = './Data'
for _, row in tmp.iterrows():
    src_file_path = os.path.join(unlabelled_data_dir, row.filename)
    dest_file_path = os.path.join(DATA_DIR, row.filename)
    shutil.copy(src_file_path, dest_file_path)

In [42]:
# prepare new csv that includes unlabelled data.
data = data.append(tmp, ignore_index=True)
data.rename(columns = {'diabetic_retinopathy': 'diabetic retinopathy'}, inplace=True)
data = data[['filename','diabetic retinopathy',  'glaucoma', 'normal']]
data.to_csv(os.path.join('./Data/train/train_augmented.csv'))

In [43]:
train_generator, valid_generator = get_data_generators(DATA_DIR, IMAGE_SIZE, BATCH_SIZE)

Found 551 validated image filenames belonging to 3 classes.
Found 1288 validated image filenames belonging to 3 classes.


In [44]:
#model accuracy against unlabelled data
loss, acc = model.evaluate(train_generator, verbose=2)

41/41 - 6s - loss: 0.3694 - accuracy: 0.9045


In [45]:
loss, acc = model.evaluate(valid_generator, verbose=2)

18/18 - 2s - loss: 0.4046 - accuracy: 0.8838


In [46]:
# get generators against new csv
train_generator, valid_generator = get_data_generators(DATA_DIR, IMAGE_SIZE, BATCH_SIZE, csv_file='./Data/train/train_augmented.csv')

Found 656 validated image filenames belonging to 3 classes.
Found 1533 validated image filenames belonging to 3 classes.


In [47]:
loss, acc = model.evaluate(train_generator, verbose=2)

48/48 - 7s - loss: 0.8352 - accuracy: 0.7704


In [48]:
loss, acc = model.evaluate(valid_generator, verbose=2)

21/21 - 3s - loss: 0.3924 - accuracy: 0.8918


In [None]:
# Vanilla evaluation without retraining => generalization much needed
# Retraining with new data set

In [49]:
model = resnet_model(input_shape=INPUT_SHAPE, num_classes = len(train_generator.class_indices))

In [50]:
conv_base = model.layers[0]
for i in range(-8,0):
    conv_base.layers[i].trainable = True

In [51]:
learning_rate = 1e-3
model.compile(
  #optimizer=tf.keras.optimizers.SGD(lr=0.005, momentum=0.9), 
  optimizer=tf.keras.optimizers.RMSprop(learning_rate=learning_rate), 
  loss=tf.keras.losses.CategoricalCrossentropy(),
  metrics=['accuracy'])

In [52]:
steps_per_epoch = train_generator.samples // train_generator.batch_size
validation_steps = valid_generator.samples // valid_generator.batch_size

fit_kwargs = dict(steps_per_epoch=steps_per_epoch, epochs=30, validation_steps=validation_steps, 
                  validation_data=valid_generator, verbose=1)

In [53]:
out = model.fit(train_generator, **fit_kwargs)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [54]:
# With label propagate => shows the model performance has decreased, as the unlaballed data set represent more classes (diff distribution) than the filtered one