<a href="https://colab.research.google.com/github/anaaparamesh/Week1_Public/blob/master/JP_Transfer_Learning_Draft_2_NASA_Jr_Week_3_pickling_version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [None]:
import tensorflow as tf

# Report whether TensorFlow can see a GPU. tf.test.is_gpu_available() is
# deprecated, so the visible physical devices are queried instead. The cell
# still evaluates to a bool, so the displayed output keeps the same meaning.
len(tf.config.list_physical_devices('GPU')) > 0

In [None]:
### Run this cell to import the packages you will need to unpack the dataset
# File manipulation and IO (input/output)
import os
import pickle
import zipfile
from google.colab import files

# Import numerical and dataframe handling
import numpy as np
import scipy
import pandas as pd

# Data preprocessing
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Model scoring
from sklearn.metrics import confusion_matrix
from sklearn import metrics

# Import standard machine learning machinery
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Garbage collection (for saving RAM during training)
import gc

# Import plotting functionality
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.ticker import MultipleLocator
import matplotlib

from skimage.color import gray2rgb
from skimage import img_as_ubyte
import time

In [None]:
# Set plotting preferences
%matplotlib inline
font = {'family' : 'sans-serif',
        'weight' : 'normal',
        'size'   : 16}
matplotlib.rc('font', **font)

In [None]:
# Base URL of the Week 3 challenge data on GitHub (ends with '/')
github_data_path = 'https://raw.githubusercontent.com/BeaverWorksMedlytics2020/Data_Public/master/ChallengeProjects/Week3/'

# Files to fetch: 3 zip archives containing the training, validation, and test
# images, followed by the binary and multiclass labels for the training and
# validation sets
challenge_files = [
    'Mamm_Images_Test.zip',
    'Mamm_Images_Train.zip',
    'Mamm_Images_Val.zip',
    'train_binary_labels.csv',
    'train_multiclass_labels.csv',
    'val_binary_labels.csv',
    'val_multiclass_labels.csv',
]

for challenge_file in challenge_files:
    # Skip files that are already present so that re-running this cell does not
    # make wget store a second copy under a different name
    if os.path.exists(challenge_file):
        continue

    # The URL is built by plain concatenation: os.path.join is meant for
    # filesystem paths, not URLs
    exit_status = os.system('wget ' + github_data_path + challenge_file)

    # os.system only returns the exit status; fail here rather than later with
    # a confusing "file not found" error in the unzip/read cells
    if exit_status != 0:
        raise RuntimeError('Download failed for ' + challenge_file +
                           ' (wget exit status ' + str(exit_status) + ')')

In [None]:
### Unzip the image data ###
# Extract each archive into the current directory. The `with` block closes the
# archive even if extraction raises part-way through.
for archive_name in ('Mamm_Images_Train.zip', 'Mamm_Images_Val.zip', 'Mamm_Images_Test.zip'):
    with zipfile.ZipFile(archive_name, 'r') as zip_ref:
        zip_ref.extractall()


### Store Images and Labels in Numpy Arrays and Dataframes

The following cell will load the images and labels into a pair of numpy ndarray and pandas dataframe objects.

In [None]:
# Number of image files in each extracted folder
num_train_files = 5000
num_val_files = 500
num_test_files = 1500
# Train and validation images are pooled into a single labeled set
num_labeled = num_train_files + num_val_files

# Use pd.read_csv to open csv file contents to pandas dataframes
train_binary_labels_df = pd.read_csv('train_binary_labels.csv', header=None)
train_multiclass_labels_df = pd.read_csv('train_multiclass_labels.csv', header=None)
val_binary_labels_df = pd.read_csv('val_binary_labels.csv', header=None)
val_multiclass_labels_df = pd.read_csv('val_multiclass_labels.csv', header=None)

# Concatenate train/validation labels into one set
#(you may decide to separate later for your own train/val/mocktest split)
train_binary_labels_df = pd.concat([train_binary_labels_df, val_binary_labels_df], axis=0, ignore_index = True)
train_multiclass_labels_df = pd.concat([train_multiclass_labels_df, val_multiclass_labels_df], axis=0, ignore_index = True)

# The labels are joined to the indices column-wise below; a row-count mismatch
# would silently produce NaN rows, so check it explicitly
if len(train_binary_labels_df) != num_labeled or len(train_multiclass_labels_df) != num_labeled:
    raise ValueError('Expected ' + str(num_labeled) + ' labels, found ' +
                     str(len(train_binary_labels_df)) + ' binary and ' +
                     str(len(train_multiclass_labels_df)) + ' multiclass labels')

# Add columns to train_binary_labels_df dataframe that contains unique
# (original) indices of train data
unique_indices_df = pd.DataFrame(list(range(num_labeled)))
train_binary_labels_df = pd.concat([train_binary_labels_df, unique_indices_df], axis=1)
train_binary_labels_df.columns = ['Label', 'Unique_Index']
train_multiclass_labels_df = pd.concat([train_multiclass_labels_df, unique_indices_df], axis=1)
train_multiclass_labels_df.columns = ['Label', 'Unique_Index']

# Load images from file and save to both
# 1) dataframe objects
# 2) numpy arrays of shape (num_examples, num_pixels_wide, num_pixels_high)

# Read in train images (from both train and val directories) to ndarray;
# the validation images are stored after the training images
train_images = np.zeros((num_labeled,299,299), dtype=np.uint8)
for ind in range(num_train_files):
  im = plt.imread('Mamm_Images_Train/image' + str(ind) + '.jpg')
  train_images[ind, :, :] = im

for ind in range(num_val_files):
  im = plt.imread('Mamm_Images_Val/image' + str(ind) + '.jpg')
  train_images[ind + num_train_files, :, :] = im

# Read in test images to ndarray
test_images = np.zeros((num_test_files,299,299), dtype=np.uint8)
for ind in range(num_test_files):
  im = plt.imread('Mamm_Images_Test/image' + str(ind) + '.jpg')
  test_images[ind, :, :] = im

# Make dataframes that contain the same information as the ndarrays
# The N-th row contains a 299x299 ndarray, and a unique index for that image
# (dtype=int replaces the np.int alias, which recent NumPy releases no longer provide)
train_images_df = pd.DataFrame([[train_images[i,:,:] for i in range(train_images.shape[0])]])
train_images_df = train_images_df.transpose()
unique_indices_df = pd.DataFrame(list(range(num_labeled)), dtype = int)
train_images_df = pd.concat([train_images_df, unique_indices_df], axis = 1)
train_images_df.columns = ['Images', 'Unique_Index']

# Test images continue the numbering right after the labeled images
test_images_df = pd.DataFrame([[test_images[i,:,:] for i in range(test_images.shape[0])]])
test_images_df = test_images_df.transpose()
unique_indices_df = pd.DataFrame(list(range(num_labeled, num_labeled + num_test_files)), dtype = int)
test_images_df = pd.concat([test_images_df, unique_indices_df], axis = 1)
test_images_df.columns = ['Images', 'Unique_Index']



In [None]:
import skimage.transform as image_transform

# Specify a desired resizing shape.
# NOTE: This can be modified to trade-off between training-speed and performance
resize_shape = (150, 150)


def _resize_image_stack(image_stack, target_shape):
    """Resize every image of a stack and return the results as a new array.

    Args:
        image_stack: Array of shape (num_images, height, width).
        target_shape: (rows, cols) tuple giving the size of each output image.

    Returns:
        Array of shape (num_images,) + target_shape with numpy's default float
        dtype, holding the resized images in their original order.
    """
    num_images = image_stack.shape[0]
    resized_stack = np.zeros((num_images,) + target_shape)
    for img_num in range(num_images):
        resized_stack[img_num] = image_transform.resize(image_stack[img_num, :, :], target_shape)
    return resized_stack


# Replace the full-resolution arrays with their resized versions.
# The test set is resized from test_images; it was previously filled from
# train_images by mistake, so the "test" array contained training images.
train_images = _resize_image_stack(train_images, resize_shape)
test_images = _resize_image_stack(test_images, resize_shape)


# Example binary CNN classifier

In [None]:
# Data normalization: rescale to the [0, 1] range, but only when the first
# image of the set still contains values above 1
if train_images[0,::].max() > 1:
    train_images = train_images.astype(np.float32) / 255.0
if test_images[0,::].max() > 1:
    test_images = test_images.astype(np.float32) / 255.0

# Append a channel axis, then let TensorFlow replicate that single channel
# three times (grayscale -> RGB) and convert the result back to numpy.
# No intermediate tensors are kept under a name, so nothing has to be deleted
# afterwards.
train_images = tf.image.grayscale_to_rgb(
    tf.constant(train_images[:, :, :, np.newaxis])
).numpy()
test_images = tf.image.grayscale_to_rgb(
    tf.constant(test_images[:, :, :, np.newaxis])
).numpy()

print(test_images.shape)

In [None]:
### Create one-hot labels
# Encode the "Label" column as one-hot rows with two classes and make sure the
# resulting array is stored as float32
train_binary_labels = tf.keras.utils.to_categorical(
    train_binary_labels_df["Label"], 2
).astype(np.float32)

In [None]:
### Shuffle and partition labeled data
val_size = 1000
mocktest_size = 500

# Stratify on the multiclass labels so that every partition keeps the class mix
# of the full labeled set. (sklearn.utils.shuffle does not accept a `stratify`
# argument in current scikit-learn releases, and stratifying a full-length
# shuffle would not balance the slices taken from it anyway.)
strata = train_multiclass_labels_df["Label"].to_numpy()
all_indices = np.arange(train_images.shape[0])

# Split row indices rather than the image array itself: this avoids holding a
# second, shuffled copy of the whole image array in memory.
remaining_idx, mocktest_idx = train_test_split(
    all_indices, test_size=mocktest_size, random_state=25, stratify=strata)
train_idx, val_idx = train_test_split(
    remaining_idx, test_size=val_size, random_state=25, stratify=strata[remaining_idx])

# Create float16 tf tensors directly from the selected rows so that no named
# numpy copy of a partition stays alive
mocktest_tf = tf.constant(train_images[mocktest_idx], dtype=tf.float16)
val_data_tf = tf.constant(train_images[val_idx], dtype=tf.float16)
partial_train_data_tf = tf.constant(train_images[train_idx], dtype=tf.float16)

mocktest_binary_labels_tf = tf.constant(train_binary_labels[mocktest_idx], dtype=tf.float16)
val_binary_labels_tf = tf.constant(train_binary_labels[val_idx], dtype=tf.float16)
tr_binary_labels_tf = tf.constant(train_binary_labels[train_idx], dtype=tf.float16)

# The numpy image array is no longer needed once the tensors exist.
# NOTE: because of this `del`, the cell cannot be re-run without first
# re-running the cells that create train_images.
del train_images



In [None]:
# Data augmentation pipeline: a Sequential model made of a random horizontal
# flip, a random rotation (factor 0.3), a random contrast change (factor 1.0)
# and a bare PreprocessingLayer instance
_augmentation_layers = layers.experimental.preprocessing
data_augmentation = keras.Sequential(
    [
        _augmentation_layers.RandomFlip("horizontal"),
        _augmentation_layers.RandomRotation(0.3),
        _augmentation_layers.RandomContrast(1.0),
        # NOTE(review): PreprocessingLayer looks like a base class; confirm
        # that it is meant to be part of the pipeline
        _augmentation_layers.PreprocessingLayer(),
    ]
)

In [None]:
# InceptionV3 feature extractor with ImageNet weights and without its
# classification top; the input shape is taken from the training tensor
# (every dimension except the leading example dimension)
feature_input_shape = partial_train_data_tf.shape[1:]
base_model = tf.keras.applications.InceptionV3(
    include_top=False,
    weights='imagenet',
    input_shape=feature_input_shape,
)
# Start with the whole feature extractor frozen
base_model.trainable = False

# Symbolic input passed through the augmentation pipeline.
# NOTE(review): `inputs` and `x` are not referenced by the Sequential model
# built further down in this notebook.
inputs = keras.Input(shape=(150, 150, 3))
x = data_augmentation(inputs)


# Define some functions for experimenting with unfreezing layers
def unfreeze_layers(model, top_n_layers):
    """Make the last `top_n_layers` layers of `model` trainable (except batch norm).

    Args:
        model: Object exposing a `layers` sequence whose items have a
            `trainable` attribute (e.g. the `base_model`).
        top_n_layers: Number of layers, counted from the end of
            `model.layers`, to unfreeze. Values <= 0 leave every layer as is.

    Returns:
        None. The selected layers are modified in place.

    NOTE(review): if `model` itself was frozen with `model.trainable = False`,
    Keras may ignore the per-layer flags set here -- confirm before relying on
    this for fine-tuning.
    """
    # `model.layers[-0:]` is the whole list, so without this guard a count of
    # 0 would unfreeze every layer instead of none
    if top_n_layers <= 0:
        return

    # We unfreeze the top layers while leaving BatchNorm layers untouched
    for layer in model.layers[-top_n_layers:]:
        if not isinstance(layer, tf.keras.layers.BatchNormalization):
            layer.trainable = True

def freeze_layers(model, top_n_layers):
    """Make the last `top_n_layers` layers of `model` untrainable (except batch norm).

    Args:
        model: Object exposing a `layers` sequence whose items have a
            `trainable` attribute (e.g. the `base_model`).
        top_n_layers: Number of layers, counted from the end of
            `model.layers`, to freeze. Values <= 0 leave every layer as is.

    Returns:
        None. The selected layers are modified in place.
    """
    # `model.layers[-0:]` is the whole list, so without this guard a count of
    # 0 would freeze every layer instead of none
    if top_n_layers <= 0:
        return

    # We freeze the top layers; BatchNorm layers are skipped and keep their
    # current `trainable` value (mirrors unfreeze_layers, which never changes them)
    for layer in model.layers[-top_n_layers:]:
        if not isinstance(layer, tf.keras.layers.BatchNormalization):
            layer.trainable = False




In [None]:
# Layers of the classification head that sits on top of the feature extractor
prediction_layer1 = tf.keras.layers.Dense(32, activation='relu')
#d_r = .5
#drop_out_layer = tf.keras.layers.Dropout(rate=d_r)
prediction_layer2 = tf.keras.layers.Dense(2, activation='softmax')
global_average_layer = tf.keras.layers.GlobalAveragePooling2D()

# Build the final model as a sequence:
# input --> feature extractor --> global pooling --> classifier --> predictions
model = tf.keras.Sequential([
    base_model,
    global_average_layer,
    prediction_layer1,
    #drop_out_layer,
    prediction_layer2
])

# Playing with unfreezing layers: fine-tune only the top `n_layers` layers of
# the feature extractor.
# The helper must be applied to `base_model`, not to `model`: `model` has only
# four layers, so taking its last 15 layers selected `base_model` itself and
# made the entire InceptionV3 network (batch norm included) trainable.
n_layers = 15
# Mark the extractor as trainable so per-layer flags are honoured, freeze each
# of its layers individually, then unfreeze the top ones (except batch norm)
base_model.trainable = True
for layer in base_model.layers:
    layer.trainable = False
unfreeze_layers(base_model, n_layers)

# Specify the loss function to use
# (with a 2-unit softmax output and one-hot labels)
loss_func = tf.keras.losses.binary_crossentropy

# Use the RMSprop learning algorithm to optimize the network weights
# (`learning_rate` replaces the deprecated `lr` keyword)
base_learning_rate = 0.0001
opt = tf.keras.optimizers.RMSprop(learning_rate=base_learning_rate)

# Compile the model using the specified loss function and optimizer
model.compile(loss=loss_func, optimizer=opt, metrics=['accuracy'])
model.summary()

In [None]:
# Note left by the author about an error seen at this step:
#   ValueError: Input 0 of layer conv2d_188 is incompatible with the layer: expected axis -1 of input shape to have value 3 but received input with shape [None, 75, 75, 1]

class garbage_collect_callback(tf.keras.callbacks.Callback):
    """Keras callback that runs the garbage collector after each epoch.

    (It will ensure that your training process does not consume all available RAM)
    """

    def on_epoch_end(self, epoch, logs=None):
        gc.collect()


# Callbacks used during training: garbage collection after every epoch, and
# early stopping with a patience of 5 epochs that restores the best weights
training_callbacks = [
    garbage_collect_callback(),
    tf.keras.callbacks.EarlyStopping(patience = 5, restore_best_weights=True),
]

# Time how long it takes the model to train for these epochs
start_time = time.time()

# Perform the training method
history = model.fit(
    partial_train_data_tf,
    tr_binary_labels_tf,
    validation_data=(val_data_tf, val_binary_labels_tf),
    epochs=12,
    batch_size=32,
    verbose=True,
    callbacks=training_callbacks,
)

stop_time = time.time()
print("--- %s seconds ---" % (stop_time - start_time))

In [None]:
# Plot the train/validation curves recorded in `history`, one figure per
# metric: accuracy first, then loss
for metric_name in ('accuracy', 'loss'):
    plt.figure(figsize=(10,8))
    plt.plot(history.history[metric_name])
    plt.plot(history.history['val_' + metric_name])
    plt.title('model ' + metric_name)
    plt.ylabel(metric_name)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

## Example code showing calculation of evaluation metrics (binary classifier)

In [None]:
# Integer class labels of the mock-test set, recovered from the one-hot tensor
ground_truth_onehot = mocktest_binary_labels_tf
ground_truth_labels = np.argmax(ground_truth_onehot.numpy(), axis=1)

# Class scores predicted by the model (one row per example, one column per
# class) and the corresponding hard predictions
predicted_onehot = model.predict(mocktest_tf)
predicted_labels = np.argmax(predicted_onehot, axis=1).astype(np.int8)

##--Confusion Matrix Score for Binary Classifier--
# (The unused `plot_confusion_matrix` import was dropped: it no longer exists
# in recent scikit-learn releases and made this cell fail on import.)
binary_confmat_weights = np.array([[2, -3],
                                  [-6, 2]])
# labels=[0, 1] keeps the matrix 2x2 even if one class never occurs, so it
# always lines up with the weight matrix
confmat = confusion_matrix(ground_truth_labels, predicted_labels, labels=[0, 1])
print(confmat)

# Simply multiply two matrices elementwise, and then sum all the results
confmat_score = np.sum(confmat * binary_confmat_weights)

#--ROCAUC for Binary Classifier--
# The ROC curve is built from the predicted score of class 1 rather than from
# the hard 0/1 predictions, which would give a curve with a single threshold;
# this also matches the multiclass example, which uses prediction scores
false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(ground_truth_labels, predicted_onehot[:, 1], pos_label=1)
roc_auc = metrics.auc(false_positive_rate, true_positive_rate)
print(f'Confusion Matrix Score: {confmat_score}, ROCAUC: {roc_auc:.3f}')


## Example code showing calculation of evaluation metrics (multiclass classifier)

In [None]:
# Dummy data for demonstrating the multiclass evaluation metrics.
#
# When your model is tested, ground_truth_labels will be the test labels and
# predicted_labels will be your model's predictions on the test set. The
# starter code does not train a multiclass classifier, so placeholder values
# are generated here instead.
#
# Variables defined by this cell
#-------------------------------------
# ground_truth_labels        - 1D array; element i is the integer true label of example i
# ground_truth_labels_onehot - one row per example, one column per class (one-hot)
# predicted_labels           - 1D array; element i is the integer predicted class of example i
# prediction_scores          - row i holds the likelihood 'score' of example i for each class

# 1500 examples in five consecutive groups of 300, so labels range from 0 to 4
ground_truth_labels = np.arange(1500) // 300
ground_truth_labels_onehot = np.eye(5)[ground_truth_labels.astype(np.uint8)]

# A very naive predictor: the prediction is always class 0
predicted_labels = np.zeros(1500, dtype=np.uint8)
prediction_scores = np.eye(5)[predicted_labels]

In [None]:
#--Confusion Matrix Score for Multiclass Classifier--
# (The unused `plot_confusion_matrix` import was dropped: it no longer exists
# in recent scikit-learn releases and made this cell fail on import.)
multiclass_confmat_weights = np.array([[2,-1,-1,-3,-3],
                                    [-2,2,-1,-3,-3],
                                    [-2,-1,2,-3,-3],
                                    [-6,-4,-4,2,-2],
                                    [-6,-4,-4,-2,2]])

# labels= keeps the matrix 5x5 even if a class never occurs, so it always
# lines up with the weight matrix
confmat = confusion_matrix(ground_truth_labels, predicted_labels, labels=list(range(5)))

# Multiply the two matrices elementwise, and then sum all the results
confmat_score = np.sum(confmat * multiclass_confmat_weights)

#--ROCAUC for Multiclass Classifier--
# Each class is treated as a binary classification (this class or all other
# classes). Because the labels are not binary, they need to be one-hot vectors
# (ex. 2 = [0,0,1,0,0]); prediction_scores are the predicted 'probability'
# that the example belongs to each of the 5 classes.
false_positive_rate = dict()
true_positive_rate = dict()
roc_auc_per_class = dict()
for i in range(5):  # false/true positive rates and ROCAUC of each class
  false_positive_rate[i], true_positive_rate[i], _ = metrics.roc_curve(ground_truth_labels_onehot[:,i], prediction_scores[:,i])
  roc_auc_per_class[i] = metrics.auc(false_positive_rate[i], true_positive_rate[i])

# Micro-average: flatten every (example, class) pair into one long binary
# problem and compute a single ROC curve over all of them
false_positive_rate["micro"], true_positive_rate["micro"], _ = metrics.roc_curve(ground_truth_labels_onehot.ravel(), prediction_scores.ravel())
roc_auc = metrics.auc(false_positive_rate["micro"], true_positive_rate["micro"])

print(f'Multiclass Confusion Matrix Score: {confmat_score}, ROCAUC: {roc_auc:.3f}')

# Submitting your model

Once you have finished creating your classifier, you need to use it to classify the test images and make a pickle file containing your model's predictions stored as a pandas dataframe. Instructions for how to build the pandas dataframe containing your model's predictions are included below. (Note that the process is demonstrated for the example classifier in the next code cell to help clarify these instructions.)

* Create a pandas dataframe with 4 columns (or 7 columns for complex labels). One column should contain your model's prediction for each image (represented as an integer) and one column should contain the unique index for that image in the test data (recall that test data indices range from 5500 to 6999). The other columns in the dataframe should contain the probability that the image is classified in each class. **Make sure the unique indices on your dataframe match the unique indices of the images being predicted. The order of rows and columns does not matter, as long as there are 1500 rows and either 4 or 7 columns (depending on which classifier you have made)**. (The cell below demonstrates how to build this dataframe using the example classifier. You can refer to this code for more clarification of these instructions.)
 * If your classifier does not compute the probabilities for each class and simply outputs an integer label for the prediction (e.g. 0, 1, 2, 3, or 4 for multiclass classification), the probability for the predicted class should be 1.00 and the probability for all other class(es) should be 0.00 in the dataframe.
* Download the pandas dataframe as a pickle file using pickle.dump() and files.download(). (See below cell for clarification on this procedure.)
* Submit your pickle file to the submissions channel on the Medlytics 2020 discord channel. You can submit up to 3 times before your final evaluation.

Your model will be evaluated on confusion matrix score (explained below), area under the ROC curve (ROCAUC), and creativity.


# Example code for saving model predictions on the test set (for submission)

In [None]:
### Use the created neural network model to create a set of predictions for the test set

# The dataframe built from these predictions shows the correct formatting for
# the pickle files submitted to the week 3 challenge. (It is produced here with
# the example CNN classifier, but the same dataframe should be produced for any
# type of classifier.)

# model.predict returns one row of class scores per test image
test_predictions_onehot = model.predict(test_images)

def create_test_predictions_df_for_submission(test_predictions_onehot, unique_indices=None):
    """Build the submission dataframe from a matrix of class scores.

    Args:
        test_predictions_onehot: 2D array-like with one row per example and
            one column per class, holding the score of each class.
        unique_indices: Optional 1D array-like with the unique index of each
            example, in the same order as the rows of
            `test_predictions_onehot`. Defaults to the "Unique_Index" column
            of the notebook-level `test_images_df`.

    Returns:
        A pandas dataframe with one column per class (named 0, 1, ...), a
        'Unique_Index' column, and a 'predicted_class' column holding the
        index of the highest-scoring class of each row.

    Raises:
        ValueError: If the number of unique indices differs from the number
            of prediction rows.
    """
    scores = np.asarray(test_predictions_onehot)

    if unique_indices is None:
        # Fall back to the indices assigned when the test images were loaded
        unique_indices = test_images_df["Unique_Index"]
    unique_index_column = pd.Series(np.asarray(unique_indices), name="Unique_Index")

    # A length mismatch would otherwise be padded with NaN by the concat below
    if len(unique_index_column) != scores.shape[0]:
        raise ValueError('Got ' + str(scores.shape[0]) + ' prediction rows but ' +
                         str(len(unique_index_column)) + ' unique indices')

    test_predictions_df = pd.DataFrame(scores)

    #add a column to dataframe containing the unique index for each sample
    test_predictions_df = pd.concat((test_predictions_df, unique_index_column), axis = 1)

    #add another column to dataframe with an integer representing the guessed class for that example
    #(Note this is the number corresponding to the class with highest probability score)
    guesses = np.argmax(scores, axis=1)
    test_predictions_df = pd.concat((test_predictions_df, pd.DataFrame(guesses, columns = ['predicted_class'])), axis = 1)

    return test_predictions_df

test_predictions_df = create_test_predictions_df_for_submission(test_predictions_onehot)
print('This is what the appropriately formatted dataframe should look like:')
print('(4 columns total for binary classification, and 7 for multiclass classification)\n')
print(test_predictions_df.head())
print('\n\n')

fname = 'what_have_the_romans_ever_done_for_us.p'
# Creates pickle file in Google Colab but doesn't save to your machine.
# The `with` block closes (and therefore flushes) the file before it is
# downloaded below.
with open(fname, 'wb') as pickle_file:
    pickle.dump(test_predictions_df, pickle_file)

# Saving to pickle file to local drive (should be in Downloads).
# Submit via the Medlytics 2020 "submissions" channel on Discord
files.download(fname)

# Troubleshooting tips

- If you try to plot the images using matplotlib  you may notice that it will invert the grayscale values when it plots. You can fix this by plotting 255 - pixel_values or by adding the argument `cmap='gray'` to your imshow function (as I did when visualizing the images).

- If you get a "ResourcesExhausted" error, this means you have used all the GPU resources dedicated to your session and you will need to restart your runtime (Runtime > Restart Runtime). After restarting your runtime your environment will be wiped so you will need to reload the data.

- If you get a warning window that says you are close to reaching the session's memory limit, this means you are nearing a ResourcesExhausted error. You can still run code until the error occurs, but note that you will likely need to restart your runtime soon.

- ResourcesExhausted errors will only occur if you're using GPU support.

- Google Colab will end your runtime automatically after 24 hours

- If things seem really slow, double check that you're using the GPU

- If you are close to using all available ram, try to double check the memory being consumed by different variables using the provided cell below

In [None]:
# If you find yourself running out of ram below is a useful method to print out 
# the sizes of the variables in your workspace

import sys
def sizeof_fmt(num, suffix='B'):
    '''Format a size as a string with a binary-prefix unit.

    by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified

    `num` is divided by 1024 until its magnitude drops below 1024, and is then
    printed with one decimal followed by the matching prefix and `suffix`.
    Values too large for the 'Zi' prefix are reported with 'Yi'.
    '''
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    position = 0
    while position < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, prefixes[position], suffix)
        num /= 1024.0
        position += 1
    return "%.1f %s%s" % (num, 'Yi', suffix)

# Print the ten largest variables of the workspace, largest first, using the
# size reported by sys.getsizeof for each one
for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

# `train_images` is deleted once the labeled data has been partitioned, so
# inspect the test images (still in memory) instead of raising a NameError
type(test_images[0,:,:])