# Hyper parameter tuning
## Part 1: Balanced data via Data Augmentation

In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from os import walk
import time
import regex as re
import os
import glob
import tensorflow as tf
import warnings
import matplotlib
import h5py
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

In [2]:
!jupyter nbconvert --output-dir="./reqs" --to script 6_Model_Training_XCT-Debugging
!cd reqs
!pipreqs --print

[NbConvertApp] Converting notebook 6_Model_Training_XCT-Debugging.ipynb to script
[NbConvertApp] Writing 30204 bytes to reqs\6_Model_Training_XCT-Debugging.py


h5py==3.7.0
matplotlib==3.5.3
MedPy==0.4.0
numpy==1.23.2
opencv_python==4.6.0.66
pandas==1.4.3
regex==2022.8.17
scikit_image==0.19.3
scikit_learn==1.1.2
skimage==0.0
tensorflow==2.10.0
tensorflow_gpu==2.7.0


INFO: Successfully output requirements


# 1. Data Acquisition

In [3]:
def load_all_image_path(img_dir):
    """Collect the image files that sit directly inside ``img_dir``.

    Parameters
    ----------
    img_dir : str
        Directory to scan (sub-directories are not searched).

    Returns
    -------
    files : list of str
        Paths matching ``<img_dir>/*g`` (e.g. ``.jpg``, ``.jpeg``, ``.png``),
        in the order ``glob`` yields them.
    img_labels : list of str
        Base name of every entry in ``files``, index-aligned with ``files``.
    """
    data_path = os.path.join(img_dir, '*g')
    files = glob.glob(data_path)
    # Names are derived from the globbed paths themselves. Listing the directory
    # separately would also return files that do not match '*g', so the two lists
    # could differ in length and callers that zip them would pair the wrong items.
    img_labels = [os.path.basename(path) for path in files]
    return files, img_labels
    

### Data Labels
__Layers with porosity__
> The following are the layer index numbers of the porosity images from the three cylinders B1, B2 and B3. The labels are based on the CAD file
information.

__Old Labels__

In [4]:
# CAD-derived porosity layers per cylinder, written as half-open (start, stop) spans
# and expanded to the individual layer numbers.
b1_prosity_index = [
    layer
    for start, stop in (
        (311, 380), (537, 554), (628, 663), (832, 862),
        (936, 937), (940, 953), (1011, 1078), (1145, 1152),
    )
    for layer in range(start, stop)
]
b2_prosity_index = [
    layer
    for start, stop in ((311, 380), (428, 463), (531, 560), (640, 654), (737, 753))
    for layer in range(start, stop)
]
b3_prosity_index = [
    layer
    for start, stop in ((420, 456), (519, 546), (619, 634), (719, 736), (819, 827), (919, 923))
    for layer in range(start, stop)
]

__New Labels__

The following image indexes were wrongly labelled by the CAD-assisted labelling: all of these images were labelled as porosity, i.e. porosity=1. We relabelled these indexes from porosity to non-porosity. 

In [5]:
# B1 layers to be re-labelled from porosity to non-porosity.
# 833 and 1011 are listed twice; the list is only used for membership tests,
# so the duplicates have no effect.
b1_remove_index = [311,312,313,318,320,325,326,335,340,366,369,374,375,376,537,538,539,540,541,542,543,544,545,546,547,
                       548,549,550,551,552,553,628,629,630,633,640,641,642,643,646,647,648,649,650,651,653,654,656,657,659,
                      661,662,833,832,833,834,835,836,837,838,840,842,843,844,845,846,847,849,850,851,852,853,855,857,936,
                       940,947,949,950,952,1011,1011,1012,1014,1018,1019,1020,1029,1030,1045,1075,1145,1146,1147,1148,1149,
                       1150,1151]


# B2 layers to be re-labelled from porosity to non-porosity.
b2_remove_index = [320,324,429,430,431,432,433,434,437,450,451,452,456,459,462,531,532,533,534,535,536,537,
                      538,539,540,541,542,544,545,548,549,550,554,559,640,641,642,643,644,645,646,647,651,737,
                      740,741,742,743,744,745,748,750,751,752]


# B3 layers to be re-labelled from porosity to non-porosity.
b3_remove_index = [420,423,425,436,439,442,449,453,519,521,522,533,534,538,541,542,543,620,621,622,627,629,
                      630,631,632,721,723,724,727,728,729,733,735,819,820,821,822,826,919,920,921,922]

In [6]:
# Keep only the porosity layers that are not listed in the matching remove-index list.
# The three cylinders are filtered in one pass and unpacked back into their names.
b1_prosity_index, b2_prosity_index, b3_prosity_index = (
    [layer for layer in kept if layer not in dropped]
    for kept, dropped in (
        (b1_prosity_index, b1_remove_index),
        (b2_prosity_index, b2_remove_index),
        (b3_prosity_index, b3_remove_index),
    )
)

### Image selection and Cropping

> The following function receives a chunk of image paths and their corresponding labels. Not all the images in Build2 are relevant to our cylinders: out of a total of 2922 images, only 963 are relevant to our 3D objects. Three cylinders, named B1, B2 and B3, were printed. Images 243 to 1243 relate to the B1 and B2 cylinders, whereas the images related to the B3 cylinder range from 218 to 1218. 
<br><br><br>
First, the images are read into a numpy array. The image dimensions are __height = 2600 and width = 1420__. Each image is then cropped into three small sections. The region __height=1250-1440 and width=650-1100__ is first cropped from the whole powder bed image.  <br> <br>
The cropped image is further divided into three parts, each containing the image of one cylinder [B1, B2, B3]. The coordinates are __B1=[h:0-190, w:0-150]__, __B2 = [h:0-190, w:150-300]__, __B3 = [h:0-190, w:300-450]__. The three images are then stored in different folders on the hard drive.  
> The __crop_save_images__ function reads images from the hard drive and crops the B1, B2 and B3 cylinders out into individual images. It also labels the images. The label consists of 
__label = Porosity_flag +  cylinder name + layer number__ 

In [8]:
def crop_save_images(files, directory, labels):
    """Crop the B1, B2 and B3 cylinders out of each image and save them as labelled JPEGs.

    Parameters
    ----------
    files : sequence of str
        Paths of the full-size images.
    directory : sequence of str
        Output folder prefixes for B1, B2 and B3, in that order. They are
        concatenated with the file name as-is, so each must end with a separator.
    labels : sequence of str
        File name of each entry in ``files``. The last four characters are cut
        off and the fourth ``_``-separated field is parsed as the layer number.

    Raises
    ------
    IOError
        If ``cv2.imread`` cannot read one of the files.

    Notes
    -----
    Saved names have the form ``<flag>_<cylinder>_Layer_<n>.jpg`` where ``flag``
    is ``1`` when the layer is in the cylinder's porosity index list and ``0``
    otherwise. B1/B2 crops are written only for 243 < layer < 1243 and B3 crops
    only for 218 < layer < 1218.
    NOTE(review): ``cv2.imread`` yields BGR channel order while ``imsave`` treats
    the array as RGB; this is harmless for grey-valued images - confirm for colour input.
    """
    # One row per cylinder:
    # (tag, porosity layers, crop columns, exclusive lower/upper layer bound, index into `directory`)
    cylinders = (
        ("B1", frozenset(b1_prosity_index), slice(0, 150), 243, 1243, 0),
        ("B2", frozenset(b2_prosity_index), slice(150, 300), 243, 1243, 1),
        ("B3", frozenset(b3_prosity_index), slice(300, 450), 218, 1218, 2),
    )

    for f1, lab in zip(files, labels):
        orig_img = cv2.imread(f1)
        if orig_img is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise IOError("Could not read image: " + str(f1))

        # Region of the powder bed that contains the three cylinders side by side.
        img = orig_img[1250:1440, 650:1100]

        # Drop the 4-character extension, then take the fourth '_'-separated field.
        layer_no = int(lab[:-4].split('_')[3])

        for tag, porosity_layers, cols, low, high, dir_pos in cylinders:
            if not (low < layer_no < high):
                continue
            flag = "1" if layer_no in porosity_layers else "0"
            img_name = directory[dir_pos] + flag + "_" + tag + "_Layer_" + str(layer_no) + ".jpg"
            matplotlib.image.imsave(img_name, img[0:190, cols])


In [9]:
# Collect the full paths (img_dir_paths) and the matching file names (img_names)
# of the images in the Build2 folder.
img_dir_paths, img_names = load_all_image_path("D:/UoH_PhD_Exp/Data/Build2")

In [10]:
def var_info(var):
    """Print the type of ``var`` and then its length, one per line."""
    kind = type(var)
    print(kind)
    size = len(var)
    print(size)
# Sanity check: the path list and the name list should have the same length.
var_info(img_dir_paths)
var_info(img_names)

<class 'list'>
2922
<class 'list'>
2922


In [11]:
# Create the output folders for the cropped images (one per cylinder) if they do not exist yet.
directories = ["D:/UoH_PhD_Exp/Data/Crop_images/B1/", "D:/UoH_PhD_Exp/Data/Crop_images/B2/", "D:/UoH_PhD_Exp/Data/Crop_images/B3/"]
for directory in directories:
    # exist_ok avoids the separate existence check and the gap between check and creation.
    os.makedirs(directory, exist_ok=True)

#### Remove all the old files in B1, B2 & B3 folder
> Since we will be cropping images many times depending on the task at hand, it is necessary to delete the old cropped images before saving the new ones. The following code empties the directories.

In [12]:
# Empty the crop folders so that crops from an earlier run cannot mix with the new ones.
for directory in directories:
    files = glob.glob(os.path.join(directory,"*"))
    for f in files:
        # "*" also matches sub-directories, on which os.remove raises; delete plain files only.
        if os.path.isfile(f):
            os.remove(f)

>Of all the images, only the first 1250 layers/images are relevant to our builds, which is why only these are considered. For B1, B2 and B3, the effective printing layers are 217-1206. For simplicity and uniformity, the relevant 1250 layers are selected.

In [13]:
# Crop the files at list positions 217..1205 (the slice end is exclusive) into the B1/B2/B3 folders.
# NOTE(review): the slice selects by list position, not by layer number; crop_save_images
# applies its own layer-number bounds on top - confirm the slice covers the intended layers.
crop_save_images(img_dir_paths[217:1206] ,directories, img_names[217:1206]) 

### B1 Cylinder images

In [14]:
# Read every cropped B1 image; the file name carries the porosity flag and the layer number.
files, labels = load_all_image_path("D:/UoH_PhD_Exp/Data/Crop_images/B1/")
data = []
b1_labels = list()
b1_layer_numbers = list()
for f1, lab in zip(files, labels):
    # Raw pattern with an escaped dot, so only a literal ".jpg" ends the layer number.
    layer_num = re.search(r'Layer_(.+?)\.jpg', lab).group(1)
    b1_layer_numbers.append("b1_"+str(layer_num))
    # The first character of the file name is the porosity flag written when cropping.
    b1_labels.append(int(lab[0]))
    img = cv2.imread(f1)
    if img is None:
        # cv2.imread returns None on failure; stop here instead of storing a None entry.
        raise IOError("Could not read image: " + str(f1))
    data.append(img)
b1_images = np.array(data)
print("B1 image shape: ", b1_images[0].shape)
print("B1 images dataset shape: ",b1_images.shape)
# Class balance: index 0 is the non-porosity count, index 1 the porosity count.
(unique, counts) = np.unique(b1_labels, return_counts=True)
print(unique, counts)
print("Total non-porosity images in B1: ", counts[0])
print("Total porosity images in B1: ", counts[1])

B1 image shape:  (190, 150, 3)
B1 images dataset shape:  (963, 190, 150, 3)
[0 1] [826 137]
Total non-porosity images in B1:  826
Total porosity images in B1:  137


### B2 Cylinder images

In [15]:
# Read every cropped B2 image; the file name carries the porosity flag and the layer number.
files, labels = load_all_image_path("D:/UoH_PhD_Exp/Data/Crop_images/B2/")
data = []
b2_labels = list()
b2_layer_numbers = list()
for f1, lab in zip(files, labels):
    # Raw pattern with an escaped dot, so only a literal ".jpg" ends the layer number.
    layer_num = re.search(r'Layer_(.+?)\.jpg', lab).group(1)
    b2_layer_numbers.append("b2_"+str(layer_num))
    img = cv2.imread(f1)
    if img is None:
        # cv2.imread returns None on failure; stop here instead of storing a None entry.
        raise IOError("Could not read image: " + str(f1))
    # The first character of the file name is the porosity flag written when cropping.
    b2_labels.append(int(lab[0]))
    data.append(img)
b2_images = np.array(data)
print("B2 image shape: ", b2_images[0].shape)
print("B2 images dataset shape: ",b2_images.shape)
# Class balance: index 0 is the non-porosity count, index 1 the porosity count.
(unique, counts) = np.unique(b2_labels, return_counts=True)
print(unique, counts)
print("Total non-porosity images in B2: ", counts[0])
print("Total porosity images in B2: ", counts[1])

B2 image shape:  (190, 150, 3)
B2 images dataset shape:  (963, 190, 150, 3)
[0 1] [854 109]
Total non-porosity images in B2:  854
Total porosity images in B2:  109


### B3 Cylinder images

In [16]:
# Read every cropped B3 image; the file name carries the porosity flag and the layer number.
files, labels = load_all_image_path("D:/UoH_PhD_Exp/Data/Crop_images/B3")
data = []
b3_labels = list()
b3_layer_numbers = list()
for f1, lab in zip(files, labels):
    # Raw pattern with an escaped dot, so only a literal ".jpg" ends the layer number.
    layer_num = re.search(r'Layer_(.+?)\.jpg', lab).group(1)
    b3_layer_numbers.append("b3_"+str(layer_num))
    img = cv2.imread(f1)
    if img is None:
        # cv2.imread returns None on failure; stop here instead of storing a None entry.
        raise IOError("Could not read image: " + str(f1))
    # The first character of the file name is the porosity flag written when cropping.
    b3_labels.append(int(lab[0]))
    data.append(img)
b3_images = np.array(data)
# The message previously said "B1" although it reports the B3 data.
print("B3 image shape: ", b3_images[0].shape)
print("B3 images dataset shape: ",b3_images.shape)
# Class balance: index 0 is the non-porosity count, index 1 the porosity count.
(unique, counts) = np.unique(b3_labels, return_counts=True)
print(unique, counts)
print("Total non-porosity images in B3: ", counts[0])
print("Total porosity images in B3: ", counts[1])

B1 image shape:  (190, 150, 3)
B3 images dataset shape:  (963, 190, 150, 3)
[0 1] [898  65]
Total non-porosity images in B3:  898
Total porosity images in B3:  65


### Augmented porosity images
> Here we first concatenate all the B1, B2 and B3 images into one dataset. Then we separate out all the porosity images, which are used to generate similar augmented porosity images.

In [22]:
# Stack the three per-cylinder image arrays along the sample axis and join the
# label / layer-id lists in the same B1, B2, B3 order.
X = np.concatenate([b1_images, b2_images, b3_images], axis=0)
y = [*b1_labels, *b2_labels, *b3_labels]
layer_nums = [*b1_layer_numbers, *b2_layer_numbers, *b3_layer_numbers]
print("X Shape: " + str(X.shape))
# counts[0] is the number of label-0 samples, counts[1] the number of label-1 samples.
unique, counts = np.unique(y, return_counts=True)
print("Total non-porosity images: ", counts[0])
print("Total porosity images: ", counts[1])

X Shape: (2889, 190, 150, 3)
Total non-porosity images:  2578
Total porosity images:  311


In [23]:
# Positions in y whose label is non-zero, i.e. the porosity samples.
porosity_indices = np.where(y)[0]
print(porosity_indices.size)

# Pick the images at those positions out of the full dataset X.
porosity_imgs = X[porosity_indices]
print(porosity_imgs.shape[0])

311
311


In [24]:
from keras.preprocessing.image import ImageDataGenerator

# Random horizontal/vertical flips plus shifts of up to 10 % of the width/height;
# pixels uncovered by a shift are filled from the nearest edge.
datagen = ImageDataGenerator(
        horizontal_flip = True,
        vertical_flip = True,
        width_shift_range=0.1,
        height_shift_range=0.1,
        fill_mode='nearest',
)

# Create the output folder up front so that saving cannot fail on a missing directory.
# NOTE(review): files left from an earlier run are not removed here; running this cell
# again adds to them and changes the class balance computed further down.
augmented_dir = r'D:/UoH_PhD_Exp/Data/Augmented_imgs'
os.makedirs(augmented_dir, exist_ok=True)

x = porosity_imgs
# A fixed seed makes the shuffling and the random transforms repeatable between runs.
augmented_batches = datagen.flow(x, batch_size=23,
                                 seed=42,
                                 save_to_dir=augmented_dir,
                                 save_prefix='dr',
                                 save_format='jpg')
# The iterator never ends by itself; each batch is written to disk as it is drawn.
# Stop once 101 batches of up to 23 images have been generated.
for i, batch in enumerate(augmented_batches, start=1):
    if i > 100:
        break

In [27]:
# Read the augmented images back from disk; every one of them is a porosity sample (label 1).
files, labels = load_all_image_path("D:/UoH_PhD_Exp/Data/Augmented_imgs")
data, augmented_labels = [], []
for f1, lab in zip(files, labels):
    img = cv2.imread(f1)
    data.append(img)
    augmented_labels.append(1)
augmented_images = np.array(data)
print(augmented_images[0].shape)
print(augmented_labels[0])

print("Total augmented porosity images: ", len(augmented_labels))

(190, 150, 3)
1
Total non-porosity images in B3:  2246


# Concatenate B1 + B2 + B3 + Augmented images = Balanced dataset

In [31]:
# Original B1/B2/B3 images followed by the augmented porosity images, with labels in the same order.
XX = np.concatenate([b1_images, b2_images, b3_images, augmented_images], axis=0)
yy = [*b1_labels, *b2_labels, *b3_labels, *augmented_labels]
print("XX Shape: " + str(XX.shape))
print("Total yy: " + str(len(yy)))

# counts[0] is the number of label-0 samples, counts[1] the number of label-1 samples.
unique, counts = np.unique(yy, return_counts=True)
print("Total non-porosity images in balanced dataset: ", counts[0])
print("Total porosity images in balanced dataset: ", counts[1])

XX Shape: (5135, 190, 150, 3)
Total yy: 5135
Total non-porosity images in balanced dataset:  2578
Total porosity images in balanced dataset:  2557


## Store images on hard drive in HDF5 format

In [32]:
def store_many_hdf5(images, labels, file_path):
    """Write an image array and its labels to a new HDF5 file.

    Parameters
    ----------
    images : array-like
        Image data; stored in the dataset "images" as unsigned 8-bit integers,
        with the shape reported by ``np.shape(images)``.
    labels : array-like
        Labels; stored in the dataset "meta" as unsigned 8-bit integers,
        with the shape reported by ``np.shape(labels)``.
    file_path : str
        Destination file. It is opened in "w" mode, so an existing file is replaced.
    """
    # The context manager closes the file even if creating a dataset raises.
    with h5py.File(file_path, "w") as file:
        file.create_dataset(
            "images", np.shape(images), h5py.h5t.STD_U8BE, data=images
        )
        file.create_dataset(
            "meta", np.shape(labels), h5py.h5t.STD_U8BE, data=labels
        )

In [33]:
# Persist the balanced dataset (images XX with labels yy) as a single HDF5 file.
dir_file_path = "D:/UoH_PhD_Exp/Data/Data_HDF/Porosity_Balanced_Data.h5"
store_many_hdf5(XX,yy, dir_file_path)

In [34]:
def read_many_hdf5(num_images, file_path):
    """Read all images and labels from an HDF5 file.

    Parameters
    ----------
    num_images : int
        Not used; kept so that existing calls keep working. The complete
        datasets are always returned.
    file_path : str
        HDF5 file containing the datasets "images" and "meta".

    Returns
    -------
    images : numpy.ndarray
        Contents of the "images" dataset as ``uint8``.
    labels : numpy.ndarray
        Contents of the "meta" dataset as ``uint8``.
    """
    # Read-only mode is enough here, and the context manager closes the handle
    # once both datasets have been copied into memory.
    with h5py.File(file_path, "r") as file:
        images = np.array(file["/images"]).astype("uint8")
        labels = np.array(file["/meta"]).astype("uint8")

    return images, labels

In [38]:
# Load the balanced dataset back from disk and confirm its size and class balance.
dir_file_path = "D:/UoH_PhD_Exp/Data/Data_HDF/Porosity_Balanced_Data.h5"
X, y = read_many_hdf5(0, dir_file_path)
print(X.shape)
print(len(y))
# counts[0] is the number of label-0 samples, counts[1] the number of label-1 samples.
unique, counts = np.unique(y, return_counts=True)
print("Total non-porosity images in balanced dataset: ", counts[0])
print("Total porosity images in balanced dataset: ", counts[1])

(5135, 190, 150, 3)
5135
Total non-porosity images in balanced dataset:  2578
Total porosity images in balanced dataset:  2557
