# 1. Install Dependencies and Setup

In [None]:
!pip install tensorflow tensorflow opencv-python matplotlib

TensorFlow (with Keras) for deep learning, OpenCV for image processing, Matplotlib for visualizations

In [None]:
!pip list

Lists all packages installed on pip

In [3]:
import tensorflow as tf
import os

Imports TensorFlow, plus os, which is used to navigate the file structure

# 2. Preprocess Images, remove bad ones

In [4]:
import cv2
import imghdr

In [5]:
# Root folder of the dataset; the cells below read its class sub-folders.
data_dir = "data"

In [6]:
os.listdir(data_dir) #Lists the entries of data_dir (here: one folder per class)

['benign', 'malignant']

In [None]:
os.listdir(os.path.join(data_dir, 'malignant')) #Lists every entry in the 'malignant' class folder

At this point, go through the class folders manually and remove images that are smaller than 10 KB or are blatantly the wrong file type

In [8]:
# Image types (as reported by imghdr.what) that are kept by the cleaning loop.
image_exts = ["jpeg", "jpg", "bmp", "png"]

In [9]:
#img = cv2.imread(os.path.join('folder', 'class', 'img')) reads one image as a NumPy array
#img.shape returns (x, y, z): x = height, y = width, z = channels
#plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) draws the image after converting OpenCV's BGR order to RGB
#plt.show() renders the figure and hides the printed object repr

In [10]:
# Walk data_dir -> class folder -> image and delete files whose detected type
# is not in image_exts. Files that cannot be decoded are reported, not deleted.
# NOTE: imghdr is deprecated since Python 3.11 and removed in 3.13.
for image_class in os.listdir(data_dir):
    class_dir = os.path.join(data_dir, image_class)
    if not os.path.isdir(class_dir):
        # Stray files in data_dir (e.g. .DS_Store) would make os.listdir raise
        continue
    for image in os.listdir(class_dir):
        image_path = os.path.join(class_dir, image)
        try:
            tip = imghdr.what(image_path)  # Type detected from the file header
            if tip not in image_exts:
                print('Image not in ext list {}'.format(image_path))
                os.remove(image_path)  # Deletes file
                continue
            # cv2.imread returns None (it does not raise) when decoding fails,
            # so the result has to be checked explicitly
            img = cv2.imread(image_path)
            if img is None:
                print('Issue with image {}'.format(image_path))
        except Exception as e:
            # Best effort: report the problem and keep going with the next file
            print('Issue with image {}: {}'.format(image_path, e))

# 3. Load Data

In [11]:
#tf.data.Dataset?? --> API for building data pipelines that scale to large datasets;
#a pipeline can also repeat over the same set of data

In [12]:
import numpy as np
from matplotlib import pyplot as plt

In [None]:
data = tf.keras.utils.image_dataset_from_directory??


In [None]:
#Builds a batched image dataset from the class sub-folders of data_dir.
#Labels and classes are inferred from the folder names, and preprocessing
#such as resizing is done on load. Uses data_dir so the dataset is read from
#the same folder that was cleaned above.
data = tf.keras.utils.image_dataset_from_directory(data_dir)

In [15]:
data_iterator = data.as_numpy_iterator()
#Iterator over the dataset that yields each batch as NumPy arrays,
#so individual values can be inspected directly

In [16]:
#Pull one batch out of the data pipeline
batch = next(data_iterator)

In [17]:
#batch
#len(batch)=2: the first element is the images, with shape (32, 256, 256, 3); the second is the labels
#The 32 is the default batch size of tf.keras.utils.image_dataset_from_directory

In [None]:
#Show the first four images of the current batch side by side, each titled with its label
#Re-run batch=data_iterator.next() to display a different batch
#Class 1 = Malignant, Class 0 = Benign
fig, ax = plt.subplots(ncols=4, figsize=(20,20))
for axis, picture, label in zip(ax, batch[0][:4], batch[1]):
    axis.imshow(picture.astype(int))
    axis.title.set_text(label)

# 4. Scale Data (Preprocessing)

In [None]:
#map applies the lambda to every (images, labels) batch as it streams through
#the pipeline: pixel values are divided by 255, labels pass through untouched
data = data.map(lambda images, labels: (images / 255, labels))

In [None]:
#Open a fresh NumPy iterator on the dataset and display its first batch
next(data.as_numpy_iterator())

# 5. Split Data

In [21]:
len(data) #Number of batches in the dataset

82

In [22]:
#Split the batches into training (70%), validation (20%) and test (the rest),
#so the model can be checked on data it was not fitted to
n_batches = len(data)
train_size = int(n_batches*.7)
val_size = int(n_batches*.2)
#The test set gets whatever is left. int(n*.1)+1 only sums to n for some batch
#counts; for others it silently drops or over-counts a batch.
test_size = n_batches - train_size - val_size

In [None]:
train_size,val_size, test_size #Batch counts of the three partitions

In [24]:
#take(n): keeps the first n batches of a dataset
#skip(n): discards the first n batches (those already allocated)
#Training gets the leading batches, validation the next block, test the block after that
train = data.take(train_size)
held_out = data.skip(train_size)
val = held_out.take(val_size)
test = held_out.skip(val_size).take(test_size)

# 6. Build Deep Learning Model

In [None]:
train #Display the training dataset object

In [30]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten

#Sequential API is used for models with one input and one output
#Functional API is for multiple inputs/outputs, branching connections, etc.
'''Layers imported from Keras: Conv2D is the convolutional layer, MaxPooling2D
condenses the image by returning only the max value of each region, Dense is a
fully connected layer, and Flatten collapses the convolutional output
(height x width x channels) into a single vector for the Dense layers'''

In [31]:
model = Sequential()
#Layers could also be passed to Sequential(...) directly; here they are appended with model.add() instead

In [32]:
#Three convolution + max-pooling stages followed by a dense classifier head.
#  - Conv2D(n, (3,3), 1): n filters of 3x3 pixels, moved 1 pixel at a time.
#    ReLU keeps only positive values, letting the network pick up non-linear
#    patterns (sigmoid etc. are other popular activations).
#  - input_shape on the first layer matches what Keras resized the images to
#    during loading.
#  - MaxPooling2D(): takes the max of each 2x2 region, halving rows and columns.
#  - Flatten(): condenses rows x columns x filters into one vector for Dense.
#  - Dense(256): fully connected layer of 256 neurons.
#  - Dense(1, sigmoid): single output between 0 and 1, mapped to the two classes.
stack = [
    Conv2D(16, (3,3), 1, activation='relu', input_shape=(256,256,3)),
    MaxPooling2D(),
    Conv2D(32, (3,3), 1, activation='relu'),
    MaxPooling2D(),
    Conv2D(16, (3,3), 1, activation='relu'),
    MaxPooling2D(),
    Flatten(),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
]
for layer in stack:
    model.add(layer)

In [None]:
#Adam optimizer, binary cross-entropy loss, accuracy reported as the metric
model.compile(
    optimizer='adam',
    loss=tf.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)
'''
- Pass in the optimizer (adam); see tf.optimizers for the list of options
- Specify the loss; this is binary classification, so use binary cross-entropy
- Metric is accuracy, to see how well the model is classifying
'''

In [None]:
model.summary() #Prints each layer with its output shape and parameter count

In [None]:
'''
Shows how the model transforms the data
First conv layer converts the 256x256x3 input to 254x254x16 (16 filters)
MaxPooling() halves that to 127x127x16 (2x2 regions); it has no parameters, so nothing to train
Second conv layer goes to 125x125x32 (size can be preserved by applying padding)
MaxPooling is applied again and halves the data to 62x62x32
One more conv layer gives 60x60x16, which is pooled to 30x30x16
Flatten turns 30x30x16 into 14400 values
These are passed into the dense layer, and finally into the single output layer
3.69M parameters in total
'''

# 7. Train

In [39]:
#Name of the folder that the TensorBoard callback is pointed at
logdir = 'logs'

In [40]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)
#Callback that writes training logs into the logdir folder,
#so the progress of a training run can be inspected over time in TensorBoard

In [None]:
#Train for 20 epochs on train, evaluating on val and logging through the
#TensorBoard callback; the returned history object is kept in hist for plotting
hist = model.fit(
    train,
    epochs=20,
    validation_data=val,
    callbacks=[tensorboard_callback],
)
'''
Fit - trains the model on the training data
Epochs - how long to train for; one epoch is one run over the training set
Validation data - runs evaluation on val so progress can be checked on unseen data
Callbacks - passes in the callback that logs training data to TensorBoard
The result is saved in hist so we can plot the training history
'''

In [None]:
'''
Loss is the loss on the training data
Accuracy is the accuracy on the training data
Val_loss and val_accuracy are the same metrics on the validation data
Loss should decrease steadily, while accuracy should do the opposite and rise
'''

In [None]:
hist.history
#Per-epoch training record; the plots below read its 'loss', 'val_loss',
#'accuracy' and 'val_accuracy' entries

# 8. Plot Performance

In [None]:
#Training vs. validation loss, one point per epoch
fig = plt.figure()
for series, shade in (('loss', 'teal'), ('val_loss', 'orange')):
    plt.plot(hist.history[series], color=shade, label=series)
fig.suptitle('Loss', fontsize=20)
plt.legend(loc="upper left")
plt.show()

#Model is overfitting - may need to change some data or apply regularization
#This is a variance problem, made visible by plotting the loss metrics

In [None]:
#Training vs. validation accuracy, one point per epoch
fig = plt.figure()
for series, shade in (('accuracy', 'teal'), ('val_accuracy', 'orange')):
    plt.plot(hist.history[series], color=shade, label=series)
fig.suptitle('Accuracy', fontsize=20)
plt.legend(loc="upper left")
plt.show()

# 9. Evaluate

In [53]:
from tensorflow.keras.metrics import Precision, Recall, BinaryAccuracy

In [54]:
#One metric instance each for precision, recall and accuracy;
#they are updated as predictions are made
pre, re, acc = Precision(), Recall(), BinaryAccuracy()

In [57]:
len(test)
#Number of batches in the test partition

9

In [None]:
#Go through every test batch and feed the true labels and the predictions to all three metrics
for batch in test.as_numpy_iterator():
    X, y = batch  #X: images, y: true values
    yhat = model.predict(X)  #Values between 0 and 1 from the sigmoid output
    for metric in (pre, re, acc):
        metric.update_state(y, yhat)

In [None]:
print(f'Precision:{pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')
#Prints precision, recall and accuracy on a 0-1 scale; 1 is ideal

# 10. Test

In [None]:
#Load a single test image from disk and display it
test_image_path = 'benigntest2.jpeg'
img = cv2.imread(test_image_path)
#cv2.imread returns None instead of raising when the file is missing or
#unreadable; fail here with a clear message rather than inside cvtColor
if img is None:
    raise FileNotFoundError(f'Could not read image: {test_image_path}')
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) #OpenCV loads BGR; convert to RGB for display
plt.show()

In [None]:
#img comes from cv2.imread and is in BGR channel order, while the model was
#trained on RGB images from the Keras loader. Convert to RGB before resizing
#to the 256x256 input size so the prediction sees the same channel order.
resize = tf.image.resize(cv2.cvtColor(img, cv2.COLOR_BGR2RGB), (256,256))
plt.imshow(resize.numpy().astype(int))
plt.show()

In [None]:
yhat = model.predict(np.expand_dims(resize/255, 0))
#Divide by 255 and add a leading batch axis with expand_dims, because the
#network processes a batch of images, not just a single image

In [None]:
yhat #Display the model's output for the test image

In [None]:
#Threshold the prediction at 0.5: above is reported as Malignant, otherwise Benign
print(f'Predicted class is Malignant' if yhat > 0.5 else f'Predicted class is Benign')

# 11. Save the Model

In [89]:
from tensorflow.keras.models import load_model

In [91]:
#Save the trained model as an HDF5 file under the models folder
model_path = os.path.join('models', 'cancerIdentV1.h5')
model.save(model_path)

In [92]:
#Reload the model from the same path it was saved to
new_model = load_model(os.path.join('models','cancerIdentV1.h5'))

In [None]:
#Run the reloaded model on the same scaled test image, with a leading batch axis added
new_model.predict(np.expand_dims(resize/255, 0))