# Data Provider for CNN in Supervised Learning Phase

Aim: assign 16/32/64/128 clusters to 4 true classes: pore, gypsum, celestite, bassanite

Currently, this is for experimenting vgg16 with round seg res and then rec res. Beeter file formatting can be found in specific jupytor notebook.

## VGG-16 (k-fold?)

In [2]:
import numpy as np
import os
from matplotlib import pyplot as plt
import cv2
import csv
import pandas as pd
import glob
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential, Model
from keras.layers import Conv2D #images are two dimensional. Videos are three dimension.
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense

from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator

In [2]:
# Set the target segmentation results 
res_folder = 'large_clusters_rec'
seg_model = 'k-means'   # choose between 'gmm' and 'k-means'
seg_nd = '3d'   # choose between '3d' and '4d'
cluster_num = 16   # choose between 16, 32, 64, and 128

# Data path
base_folder = os.path.join(os.getcwd(), res_folder, seg_model, seg_nd, 'cluster_{}'.format(cluster_num))

# corresponding label csv file
csv_file = os.path.join(os.getcwd(), '{}_{}_{}_f1.csv'.format(seg_model, seg_nd, cluster_num))

# read csv file
df = pd.read_csv(csv_file, usecols = ['slice', 'current_cluster'])

In [3]:
def get_cluster_num_str(c):
    c = str(c).zfill(3)
    return c

In [4]:
# get filename as new column

df['filename'] = df['slice'].map(str) + '\VA10_0050_0' + df['slice'].map(str) + '_' + df['current_cluster'].map(get_cluster_num_str)  + '.rec.8bit.png'
df['predict_class'] = df['predict_class'].map(str)

In [8]:
# randomly get 200 slices from [400, 800] and them select all clusters for these slices to form a balanced training set
slice_list = np.array(range(400, 801)) # [400,800] inclusive, 401 in total
train_slice, other_set = train_test_split(slice_list, test_size=0.5, random_state=104)
val_slice, test_slice = train_test_split(other_set, test_size=0.5, random_state=104)

In [9]:
train_set = df.loc[df['slice'].isin(train_slice)]
val_set = df.loc[df['slice'].isin(val_slice)]
test_set = df.loc[df['slice'].isin(test_slice)]

In [10]:
train_set.head(5)

Unnamed: 0,slice,current_cluster,predict_class,filename
0,400,0,1,400\VA10_0050_0400_000.rec.8bit.png
1,400,1,1,400\VA10_0050_0400_001.rec.8bit.png
2,400,2,3,400\VA10_0050_0400_002.rec.8bit.png
3,400,3,1,400\VA10_0050_0400_003.rec.8bit.png
4,400,4,1,400\VA10_0050_0400_004.rec.8bit.png


# Data Preprocessing

### Crop the segmentation result to leave only the ROI (700, 855) [DO NOT RUN BEFORE CHECKING THE CURRENT SIZE OF TARGETS]

In [52]:
# Set the target segmentation results 
res_folder = 'large_clusters_rec'
seg_model = 'gmm'   # choose between 'gmm' and 'k-means'
seg_nd = '4d'   # choose between '3d' and '4d'
cluster_num = 128   # choose between 16, 32, 64, and 128

# Data path
base_folder = os.path.join(os.getcwd(), res_folder, seg_model, seg_nd, 'cluster_{}'.format(cluster_num))

In [53]:
base_folder

'd:\\MSc-Project\\large_clusters_rec\\gmm\\4d\\cluster_128'

```
imgs = glob.glob(os.path.join(base_folder, '*', '*.png'))
for i in imgs:
    crop = cv2.imread(i)[353:1053, 282:1137]
    cv2.imwrite(i, crop)
```

k-means 3d 16: 2m 4.2s
k-means 4d 16: 2m 2.9s
k-means 3d 32: 4m 2.3s
k-means 4d 32: 4m 5.6s
k-means 3d 64: 7m 53.6s
k-means 4d 64: 7m 54.9s
k-means 3d 128: 15m 40.2s
k-means 4d 128: 15m 37.6s
gmm 3d 16: 1m 59.2s
gmm 4d 16: 2m 2.5s
gmm 3d 32: 3m 50.0s
gmm 4d 32: 3m 47.9s
gmm 3d 64: 7m 31.4s
gmm 4d 64: 7m 35.2s
gmm 3d 128: 14m 50.3s
gmm 4d 128: 14m 51.1s


In [23]:
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input

In [29]:
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)


train_generator = train_datagen.flow_from_dataframe(dataframe=train_set, directory=base_folder,
                                             x_col='filename',
                                             y_col='predict_class',
                                             target_size=(700, 855),
                                             batch_size=16,
                                             shuffle=True,
                                             class_mode='categorical',
                                             seed=7)

val_generator = val_datagen.flow_from_dataframe(dataframe=val_set, directory=base_folder,
                                             x_col='filename',
                                             y_col='predict_class',
                                             target_size=(700, 855),
                                             batch_size=16,
                                             shuffle=True,
                                             class_mode='categorical',
                                             seed=7)


test_generator = test_datagen.flow_from_dataframe(dataframe=test_set, directory=base_folder,
                                             x_col='filename',
                                             y_col=None,
                                             target_size=(700, 855),
                                             batch_size=16,
                                             shuffle=False,
                                             class_mode=None,
                                             seed=7
                                             )

Found 3200 validated image filenames belonging to 4 classes.
Found 1600 validated image filenames belonging to 4 classes.
Found 1616 validated image filenames.


# Experiments with VGG16

conda create --name base-tf --clone tf-gpu-nvcc

tf-gpu-nvcc is the one used for all previous development but had conflicts for version of h5py in conda and pip. Did not find out the solution is to uninstall the version in pip, not uninstall from conda. because jupyter is always trying to refer to packages in conda. Also tried to roll-back to previous version of conda env by conda install --revision & conda install --rev 2, but nothing seemed to happen.

base-tf is a clone version of tf-gpu-nvcc before revision was done.

test-uninstall is a clone version of tf-gpu-nvcc after revision. Found the reason of DLL error was because of version conflicts in conda and pip. Solution is to delete either conda's h5py or pip's h5py. 'cannot find File attribute'  is due to jupyter trying to use the packages inside conda env. Solution is to uninstall pip one rather than conda one. 

In [30]:
vgg16_model = VGG16(include_top=False, input_shape=(700, 855, 3))

In [31]:
for layer in vgg16_model.layers:
    layer.trainable = False

In [32]:
x = Flatten()(vgg16_model.output)
prediction = Dense(4, activation='softmax')(x)
model = Model(inputs=vgg16_model.input, outputs=prediction)
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 700, 855, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 700, 855, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 700, 855, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 350, 427, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 350, 427, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 350, 427, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 175, 213, 128)     0   

In [33]:
from keras.optimizers import Adam

In [34]:
STEP_SIZE_TRAIN=train_generator.n//train_generator.batch_size
STEP_SIZE_VALID=val_generator.n//val_generator.batch_size
STEP_SIZE_TEST=test_generator.n//test_generator.batch_size

In [35]:
model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

In [36]:
from datetime import datetime
from keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint('vgg16.h5', save_best_only=True, monitor='val_loss', mode='min')
callbacks = [checkpoint]

start = datetime.now()
model_history = model.fit_generator(generator=train_generator,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=val_generator,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=7,
                    callbacks=callbacks, verbose=1)

duration = datetime.now() - start
print("Training time: ", duration)

  model_history = model.fit_generator(generator=train_generator,


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Training time:  0:10:35.132742


In [37]:
test_generator.reset()
pred=model.predict_generator(test_generator,
                             steps=STEP_SIZE_TEST,
                             verbose=1)

  pred=model.predict_generator(test_generator,




array([[0.0000000e+00, 1.0000000e+00, 8.7173673e-38, 5.8059596e-14],
       [1.7404351e-36, 1.0000000e+00, 0.0000000e+00, 1.5984313e-19],
       [1.2185826e-02, 4.3380901e-14, 1.2810710e-21, 9.8781419e-01],
       ...,
       [8.4862138e-30, 4.5451602e-26, 0.0000000e+00, 1.0000000e+00],
       [0.0000000e+00, 3.5413367e-21, 0.0000000e+00, 1.0000000e+00],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 4.1316549e-23]],
      dtype=float32)

In [38]:
predicted_class_indices=np.argmax(pred,axis=1)

In [43]:
# test_set['res'] = pd.Series(predicted_class_indices)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set['res'] = pd.Series(predicted_class_indices)


In [46]:
test_df = test_set.reset_index()

In [47]:
test_df['label'] = pd.Series(predicted_class_indices)

In [56]:
correct = test_df[test_df['predict_class'] == test_df['label'].map(str)].shape[0]
print("Test Accuracy: ", correct/len(predicted_class_indices))

Test Accuracy:  0.9876237623762376


# Attempt to alter data structure of the folder of segmentation results to use flow_from_dirctory, but then figure out there's a function called flow_from_dataframe

In [34]:
# create directories following the requiremnets of flow_from_dict and ImageDataGenerator(...).flow()
train_dir = os.path.join(os.getcwd(), 'test_dst')
os.makedirs(train_dir)

for i in range(4):
    os.makedirs(os.path.join(train_dir, str(i)))

In [20]:
#all_files = glob.glob(os.path.join(os.getcwd(), 'large_clusters_rec', 'gmm', '3d', 'cluster_16', '*', '*.png'))
all_files = glob.glob(os.path.join(os.getcwd(), 'test_src', 'gmm', '3d', 'cluster_16', '*', '*.png'))
all_files

['d:\\MSc-Project\\test_src\\gmm\\3d\\cluster_16\\400\\VA10_0050_0400_000.rec.8bit.png',
 'd:\\MSc-Project\\test_src\\gmm\\3d\\cluster_16\\400\\VA10_0050_0400_001.rec.8bit.png',
 'd:\\MSc-Project\\test_src\\gmm\\3d\\cluster_16\\400\\VA10_0050_0400_002.rec.8bit.png',
 'd:\\MSc-Project\\test_src\\gmm\\3d\\cluster_16\\400\\VA10_0050_0400_003.rec.8bit.png',
 'd:\\MSc-Project\\test_src\\gmm\\3d\\cluster_16\\753\\VA10_0050_0753_000.rec.8bit.png',
 'd:\\MSc-Project\\test_src\\gmm\\3d\\cluster_16\\753\\VA10_0050_0753_001.rec.8bit.png',
 'd:\\MSc-Project\\test_src\\gmm\\3d\\cluster_16\\753\\VA10_0050_0753_002.rec.8bit.png',
 'd:\\MSc-Project\\test_src\\gmm\\3d\\cluster_16\\753\\VA10_0050_0753_003.rec.8bit.png']

In [39]:
test_df = df.loc[df['slice'].isin(slice_list)]

In [50]:
src_path = os.path.join(os.getcwd(), 'test_src', 'gmm', '3d', 'cluster_16')
dst_path = os.path.join(os.getcwd(), 'test_dst')

In [51]:
for row in test_df.itertuples(index=False):
    src_p = os.path.join(src_path, str(row.slice), row.filename)
    img = cv2.imread(src_p)[353:1053, 282:1137]
    dst_p = os.path.join(dst_path, str(row.predict_class), row.filename)
    cv2.imwrite(dst_p, img)


# Some notes and todos

In [None]:
# imbalanced dataset again
# 

In [None]:
# find the threshold and use the labeling to evaluate the performance as a whole --> test labeling
# use the finalized labeling for cnns
# visualise csv files to get more findings
# Using CNN at this stage, could also help find features. especially the transfering learning, we can freeze layers to utilize in future work.

In [None]:
#for the above tasks, try to first build up the pipeline for combining different clusters and evaluate. --> basic functions has already been established bi visual_plots file. Can use do.

In [2]:
p = os.getcwd()
l = glob.glob(os.path.join(p, '*.ipynb'))
l

['d:\\MSc-Project\\cnn_data_loader.ipynb',
 'd:\\MSc-Project\\testing_func.ipynb',
 'd:\\MSc-Project\\test_keras_fun.ipynb',
 'd:\\MSc-Project\\unsupervised_evaluation.ipynb',
 'd:\\MSc-Project\\volume_analysis.ipynb']

In [6]:
p = os.getcwd()
l = glob.glob(os.path.join(p, '*.h5'))
l

['d:\\MSc-Project\\vgg16.h5']