In [1]:
import numpy as np
import os
from matplotlib import pyplot as plt
import cv2
import csv
import pandas as pd
import glob
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Conv2D #images are two dimensional. Videos are three dimension.
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense

from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator

In [2]:
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input

In [3]:
from keras.optimizers import Adam
from datetime import datetime
from keras.callbacks import ModelCheckpoint

In [4]:
from auto_label import auto_label, PRECISION_SOLO

# Data Precrocessing

In [5]:
def get_cluster_num_str(c):
    c = str(c).zfill(3)
    return c

In [6]:
# Set the target segmentation results 
res_folder = 'large_clusters_rec'
seg_model = 'k-means'   # choose between 'gmm' and 'k-means'
seg_nd = '3d'   # choose between '3d' and '4d'
cluster_num = 16   # choose between 16, 32, 64, and 128

### Prepare Dataframe for Data Generator

In [7]:
# corresponding csv file to get slice & cluster combination
csv_file = os.path.join(os.getcwd(), 'evaluation_rec_f1', '{}_{}_{}_f1.csv'.format(seg_model, seg_nd, cluster_num))
df = pd.read_csv(csv_file, usecols = ['slice', 'current_cluster'])
df['filename'] = df['slice'].map(str) + '\VA10_0050_0' + df['slice'].map(str) + '_' + df['current_cluster'].map(get_cluster_num_str)  + '.rec.8bit.png'

In [9]:
# prepare labels for data generator
cluster_label_one_slice = auto_label(seg_model, seg_nd, cluster_num, PRECISION_SOLO, threshold=0)
cluster_label = np.array(cluster_label_one_slice*401)
df['class'] =cluster_label
df['class'] = df['class'].map(str)

### Train_Val_Test Split

In [10]:
# randomly get 200 slices from [400, 800] and them select all clusters for these slices to form a balanced training set
slice_list = np.array(range(400, 801)) # [400,800] inclusive, 401 in total
train_slice, other_set = train_test_split(slice_list, test_size=0.5, random_state=104)
val_slice, test_slice = train_test_split(other_set, test_size=0.5, random_state=104)

In [11]:
train_set = df.loc[df['slice'].isin(train_slice)]
train_set.reset_index()
val_set = df.loc[df['slice'].isin(val_slice)]
val_set.reset_index()
test_set = df.loc[df['slice'].isin(test_slice)]
test_set.reset_index()

Unnamed: 0,index,slice,current_cluster,filename,class
0,144,409,0,409\VA10_0050_0409_000.rec.8bit.png,2
1,145,409,1,409\VA10_0050_0409_001.rec.8bit.png,3
2,146,409,2,409\VA10_0050_0409_002.rec.8bit.png,1
3,147,409,3,409\VA10_0050_0409_003.rec.8bit.png,1
4,148,409,4,409\VA10_0050_0409_004.rec.8bit.png,2
...,...,...,...,...,...
1611,6395,799,11,799\VA10_0050_0799_011.rec.8bit.png,4
1612,6396,799,12,799\VA10_0050_0799_012.rec.8bit.png,1
1613,6397,799,13,799\VA10_0050_0799_013.rec.8bit.png,2
1614,6398,799,14,799\VA10_0050_0799_014.rec.8bit.png,2


### Get Data Generator for Train/Val/Test

In [12]:
batch_size = 16

# Data path: used in data generator
base_folder = os.path.join(os.getcwd(), res_folder, seg_model, seg_nd, 'cluster_{}'.format(cluster_num))

train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)


train_generator = train_datagen.flow_from_dataframe(dataframe=train_set, directory=base_folder,
                                             x_col='filename',
                                             y_col='class',
                                             target_size=(700, 855),
                                             batch_size=batch_size,
                                             shuffle=True,
                                             class_mode='categorical',
                                             seed=7)

val_generator = val_datagen.flow_from_dataframe(dataframe=val_set, directory=base_folder,
                                             x_col='filename',
                                             y_col='class',
                                             target_size=(700, 855),
                                             batch_size=batch_size,
                                             shuffle=True,
                                             class_mode='categorical',
                                             seed=7)


test_generator = test_datagen.flow_from_dataframe(dataframe=test_set, directory=base_folder,
                                             x_col='filename',
                                             y_col=None,
                                             target_size=(700, 855),
                                             batch_size=batch_size,
                                             shuffle=False,
                                             class_mode=None,
                                             seed=7
                                             )

Found 3200 validated image filenames belonging to 4 classes.
Found 1600 validated image filenames belonging to 4 classes.
Found 1616 validated image filenames.


# Model

In [13]:
vgg16_model = VGG16(include_top=False, input_shape=(700, 855, 3))
for layer in vgg16_model.layers:
    layer.trainable = False
x = Flatten()(vgg16_model.output)
prediction = Dense(4, activation='softmax')(x)
model = Model(inputs=vgg16_model.input, outputs=prediction)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 700, 855, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 700, 855, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 700, 855, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 350, 427, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 350, 427, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 350, 427, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 175, 213, 128)     0     

# Training 

Skip if model already trained; Go to next session 'Load Model'

In [17]:
glob.glob('*.h5')

['vgg16.h5']

In [14]:
STEP_SIZE_TRAIN=train_generator.n//train_generator.batch_size
STEP_SIZE_VALID=val_generator.n//val_generator.batch_size
STEP_SIZE_TEST=test_generator.n//test_generator.batch_size

In [19]:
model_name = '{}_{}_{}_vgg16.h5'.format(seg_model, seg_nd, cluster_num)
model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=[keras.metrics.Accuracy(), keras.metrics.Precision(), keras.metrics.Recall()])
checkpoint = ModelCheckpoint(model_name, save_best_only=True, monitor='val_loss', mode='min')
callbacks = [checkpoint]

In [20]:
start = datetime.now()
model_history = model.fit_generator(generator=train_generator,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=val_generator,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=7,
                    callbacks=callbacks, verbose=1)

duration = datetime.now() - start
print("Training time: ", duration)

  model_history = model.fit_generator(generator=train_generator,


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Training time:  0:09:52.154171


# Load Trained Model

In [None]:
model_name = '{}_{}_{}_vgg16.h5'.format(seg_model, seg_nd, cluster_num)
model = load_model(model_name)

# Evaluation & Tests

In [21]:
test_generator.reset()
pred=model.predict(test_generator,
                             steps=STEP_SIZE_TEST,
                             verbose=1)



In [27]:
predicted_class_indices=np.argmax(pred,axis=1)
# test_df = test_set.reset_index()
# test_df['vgg16_predict'] = pd.Series(predicted_class_indices)
#"""need to test the function below first"""
gold_label = test_set['class'].map(int)
gold_label = gold_label.to_numpy()
# p, r, f1, _ = precision_recall_fscore_support(gold_label, predicted_class_indices, average=None, labels=[1,2,3,4])
# acc = accuracy_score(gold_label, predicted_class_indices)
# print("Test Accuracy: ", acc)
# print(p, r, f1)


NameError: name 'int64' is not defined

In [28]:
predicted_class_indices

array([1, 2, 0, ..., 1, 1, 3], dtype=int64)

In [29]:
gold_label

array([2, 3, 1, ..., 2, 2, 4], dtype=int64)

In [None]:
# naming convention need to be check