# Baseline CNN and creation of BSD

### This file contains the results of the baseline CNN and the creation of the BSD dataset. The steps performed are:

1. This dataset consists of 2302 training and 998 test samples. 
2. The experimental dataset is randomly split into train and test sets with a split ratio of 70:30.
3. The baseline CNN model has four convolutional layers with 15 filters each of size 3*3. 
4. Adam is used as the optimizer, and binary cross-entropy is used as the loss function. 
5. This CNN is trained for 150 epochs. 
6. Softmax is used as the final activation function which outputs predicted probabilities. 
7. The BSD is created; it contains all the samples whose confidence factor is less than a certain threshold.

In [1]:
import scipy.misc
import random
import imageio
from keras.utils import to_categorical
import tensorflow as tf
from tensorflow.python.tools import freeze_graph
from tensorflow.python.tools import optimize_for_inference_lib
from keras.applications.inception_v3 import InceptionV3

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# from keras.utils.vis_utils import plot_model
from tensorflow.keras.utils import plot_model
from keras import backend as K
import keras.optimizers as optimizers
# from keras.utils import np_utils
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.models import load_model
from keras.layers import Lambda, Conv2D, MaxPooling2D, Dropout, Dense, Flatten, Activation, BatchNormalization
from keras.optimizers import SGD,RMSprop,Adam

import imageio.v2 as imageio

The malignant and benign images are loaded into NumPy arrays.

In [2]:
xs = []
ys = []

def Load_data_malignant(path="Dataset_Final/malignant", num_images=1499):
    """Load the malignant images together with their labels.

    The images are read from files named ``1.jpg``, ``2.jpg``, ... up to
    ``<num_images>.jpg`` inside ``path``.

    Args:
        path: Directory holding the numbered ``.jpg`` files.
        num_images: How many consecutively numbered images to read, starting
            at ``1.jpg``. The default (1499) reproduces the previously
            hard-coded ``range(1, 1500)``.

    Returns:
        A tuple ``(images, labels)`` of two lists of equal length; every
        label is ``1`` (malignant).
    """
    x_out = []
    y_out = []
    for i in range(1, num_images + 1):
        img = imageio.imread(f"{path}/{i}.jpg")
        x_out.append(img)
        y_out.append(1)  # 1 marks the malignant class
    return x_out, y_out


In [3]:
def Load_data_benign(path="Dataset_Final/benign", num_images=1799):
    """Load the benign images together with their labels.

    The images are read from files named ``1.jpg``, ``2.jpg``, ... up to
    ``<num_images>.jpg`` inside ``path``.

    Args:
        path: Directory holding the numbered ``.jpg`` files.
        num_images: How many consecutively numbered images to read, starting
            at ``1.jpg``. The default (1799) reproduces the previously
            hard-coded ``range(1, 1800)``.

    Returns:
        A tuple ``(images, labels)`` of two lists of equal length; every
        label is ``0`` (benign).
    """
    x_out = []
    y_out = []
    for i in range(1, num_images + 1):
        img = imageio.imread(f"{path}/{i}.jpg")
        x_out.append(img)
        y_out.append(0)  # 0 marks the benign class
    return x_out, y_out

The benign and malignant arrays are assigned to x and y arrays

In [4]:
# Read the benign images (x_0) and their labels (y_0, all 0) from disk.
x_0, y_0 = Load_data_benign()

In [5]:
# Read the malignant images (x_1) and their labels (y_1, all 1) from disk.
x_1, y_1 = Load_data_malignant()

In [6]:
# Convert the per-class Python lists into NumPy arrays:
# benign (x_0, y_0) first, then malignant (x_1, y_1).
x_0, y_0 = np.array(x_0), np.array(y_0)
x_1, y_1 = np.array(x_1), np.array(y_1)


The arrays are concatenated to form x and y arrays (features, outputs)

In [7]:
# Two target classes: 0 = benign, 1 = malignant.
num_classes = 2

# Stack the benign samples first and the malignant samples after them.
x = np.concatenate([x_0, x_1], axis=0)

# Join the integer labels in the same order and one-hot encode them.
y = to_categorical(np.concatenate([y_0, y_1], axis=0), num_classes)

The data is split as train and test with 70:30 split

In [8]:
# Stratified 70:30 hold-out split; the fixed seed keeps the split repeatable.
x_train, x_Test, y_train, y_Test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=5,
    stratify=y,
)

The CNN model is defined below

In [9]:
nClasses = 2


def createModel():
    """Build the baseline CNN and print its summary.

    Architecture, in order: four 3x3 convolutions with 15 filters each
    (ReLU), each followed by batch normalisation; 2x2 max-pooling after the
    second, third and fourth convolution; dropout (0.3) after the first two
    pooling stages; then Flatten, Dense(100, ReLU), dropout (0.3) and a
    softmax layer with ``nClasses`` outputs.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    def conv3x3(padding, **kwargs):
        # Every convolution in this network shares filters, kernel and activation.
        return Conv2D(15, (3, 3), padding=padding, activation='relu', **kwargs)

    stack = [
        # Block 1: the input layer fixes the expected image shape.
        conv3x3('valid', input_shape=(224, 224, 3)),
        BatchNormalization(),
        # Block 2
        conv3x3('same'),
        BatchNormalization(),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.3),
        # Block 3
        conv3x3('same'),
        BatchNormalization(),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.3),
        # Block 4
        conv3x3('valid'),
        BatchNormalization(),
        MaxPooling2D(pool_size=(2, 2)),
        # Classifier head
        Flatten(),
        Dense(100, activation='relu'),
        Dropout(0.3),
        Dense(nClasses, activation='softmax'),
    ]

    model = Sequential()
    for layer in stack:
        model.add(layer)
    model.summary()

    return model

Optimizer is chosen as Adam, binary cross entropy is the loss function, with epochs as 150

In [10]:
# Training hyper-parameters.
batch_size = 40
epochs = 150

# Reset the name before building a fresh model.
model1 = None
model1 = createModel()

# Adam optimizer, binary cross-entropy loss, categorical accuracy as the metric.
opt = tf.keras.optimizers.legacy.Adam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.99,
    epsilon=None,
    decay=0,
)
model1.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['categorical_accuracy'],
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 222, 222, 15)      420       
                                                                 
 batch_normalization (Batch  (None, 222, 222, 15)      60        
 Normalization)                                                  
                                                                 
 conv2d_1 (Conv2D)           (None, 222, 222, 15)      2040      
                                                                 
 batch_normalization_1 (Bat  (None, 222, 222, 15)      60        
 chNormalization)                                                
                                                                 
 max_pooling2d (MaxPooling2  (None, 111, 111, 15)      0         
 D)                                                              
                                                        

The model is fit to the data

In [None]:
# Train for the configured number of epochs (`epochs`, defined next to
# `batch_size`) instead of a hard-coded single epoch.
# NOTE: the held-out test split is also passed as the validation data here.
history = model1.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(x_Test, y_Test),
    shuffle=True,
)



The accuracy and loss graphs are obtained

In [None]:
# Training accuracy per epoch, as recorded in the fit history.
train_accuracy = history.history['categorical_accuracy']
plt.plot(train_accuracy)
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title('model accuracy')
plt.legend(['train'], loc='upper left')
plt.show()

In [None]:
# Training loss per epoch, as recorded in the fit history.
train_loss = history.history['loss']
plt.plot(train_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('model loss')
plt.legend(['train'], loc='upper right')
plt.show()

The classification report of the test data is printed

In [None]:
from sklearn.metrics import classification_report

# Predict class probabilities for the test split with the trained model.
# The model object in this notebook is `model1`; no `model` is defined.
pred = model1.predict(x_Test, batch_size=32, verbose=1)

# Collapse probabilities and one-hot targets to class indices before scoring.
predicted = np.argmax(pred, axis=1)
report = classification_report(np.argmax(y_Test, axis=1), predicted)
print(report)

After observing the CPVS of the test samples and the respective outputs, the threshold is calculated as 99.99995 and the BSD is created

In [None]:
from PIL import Image
import numpy as np
import os

# A test sample goes into the BSD when the gap between its two predicted
# class probabilities (expressed in percent) is below this threshold.
BSD_THRESHOLD = 99.99995

# Output directory per true class index (0 = benign, 1 = malignant).
BSD_DIRS = {0: 'final/benign/', 1: 'final/malignant/'}

# Make sure the target directories exist before saving anything.
for out_dir in BSD_DIRS.values():
    os.makedirs(out_dir, exist_ok=True)

# Walk over every prediction rather than a hard-coded sample count, so no
# test sample is skipped when the size of the test split changes.
for i, probs in enumerate(pred):
    if abs((probs[0]*100) - (probs[1]*100)) < BSD_THRESHOLD:
        img = Image.fromarray(x_Test[i], 'RGB')
        out_dir = BSD_DIRS.get(int(np.argmax(y_Test[i])))
        if out_dir is not None:
            # The file name is the sample's index within the test split.
            img.save(os.path.join(out_dir, str(i) + '.jpg'))