## Initial setup

**The following folders must exist before running the notebook: `best_classify_models_hyperparameter` and `classify_scores_hyperparameter`.**

In [None]:
# Mount Google Drive into the Colab runtime; the image dataset is read from
# a folder under this mount point further down.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Report the versions of the main frameworks available in this runtime,
# in the order TensorFlow, PyTorch, Matplotlib.
import tensorflow as tf
import torch
import matplotlib

for framework in (tf, torch, matplotlib):
    print(framework.__version__)

2.8.0
1.10.0+cu111
3.2.2


In [None]:
# Shell escape: print the NVIDIA driver / GPU status of this runtime.
!nvidia-smi

Fri Jan 21 19:41:32 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Other imports
# Extra packages installed at runtime (unpinned). In the recorded run shown in
# this cell's output they resolved to tensorflow-addons 0.16.1 and
# tensorflow-io 0.24.0.
! pip install tensorflow_addons
! pip install tensorflow_io

import os
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img

import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from matplotlib.ticker import MultipleLocator, FormatStrFormatter, AutoMinorLocator
from imutils import paths
from tqdm import tqdm
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
import tensorflow_io as tfio
import tensorflow_hub as hub
import numpy as np
import cv2
import pandas as pd
import seaborn as sns
from scipy.stats import mannwhitneyu
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
import sklearn.manifold
from sklearn.metrics.pairwise import cosine_similarity as cos
from sympy.utilities.iterables import multiset_permutations
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import *
from sklearn.preprocessing import StandardScaler
from IPython.display import Image, display


import zipfile
import concurrent.futures

# Fix the global NumPy and TensorFlow seeds so that random operations are
# repeatable between runs.
random_seed = 42
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.16.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 6.4 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.16.1
Collecting tensorflow_io
  Downloading tensorflow_io-0.24.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (23.4 MB)
[K     |████████████████████████████████| 23.4 MB 1.8 MB/s 
Installing collected packages: tensorflow-io
Successfully installed tensorflow-io-0.24.0


## Dataset gathering and preparation

In [None]:
# Batch size used both for the tf.data input pipeline and for fitting the
# downstream classifier; BATCH_SIZE is an alias of the same value.
training_batch_size = 4
BATCH_SIZE = training_batch_size

# Side length (in pixels) that every image is resized to.
imageSize = 224

# Names of the four image categories.
category_names = ['bundle', 'dispersed', 'network', 'singular']

# Plot styling: matplotlib cycle colours 'C0'..'C4', named colours, markers
# and the seaborn colour-blind-friendly palette.
color_method = ['C%d' % cycle_index for cycle_index in range(5)]
color = ['black', 'magenta', 'cyan', 'yellow']
marker = ['o', 's', '<', '>', '^']
seaborn_palette = sns.color_palette("colorblind")

In [None]:
# Collect every .jpg under the dataset folder and shuffle the paths with a
# fixed seed.
np.random.seed(random_seed)
peptide_morph_train_path = "/content/drive/MyDrive/TEM image datasets/2022-nanowire-morphology"

# Sort before shuffling: the files are discovered in directory-listing order,
# which is not guaranteed to be stable across runs or file systems. Without a
# canonical starting order the seeded shuffle below (and every train/test
# split derived from it) would not be reproducible.
image_paths_sorted = sorted(paths.list_files(basePath=peptide_morph_train_path, validExts='jpg'))
if not image_paths_sorted:
    # Fail here with a clear message instead of much later inside the
    # label-encoding or feature-extraction cells.
    raise FileNotFoundError("No .jpg images found under %r" % peptide_morph_train_path)

# Sampling all elements without replacement is a seeded permutation.
peptide_morph_images_train = np.random.choice(np.array(image_paths_sorted), len(image_paths_sorted), replace=False)
print(len(peptide_morph_images_train))

400


In [None]:
# The label of an image is the name of the directory that contains it
# (second-to-last component of its '/'-separated path).
train_labels = [image_path.split("/")[-2] for image_path in peptide_morph_images_train]

# Encode the string labels as integer class ids.
le = LabelEncoder()
peptide_morph_train_enc = le.fit_transform(train_labels)

In [None]:
# Image preprocessing utils
@tf.function
def parse_images(image_path):
    """Load one JPEG file as a 3-channel float32 image of size imageSize x imageSize.

    Args:
        image_path: path of the JPEG file to read.

    Returns:
        The decoded image, converted to float32 with
        tf.image.convert_image_dtype and resized to [imageSize, imageSize].
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    # TIFF alternative kept for reference (per the original note, decode_tiff
    # yields a 4th opacity channel that is not needed, hence the [:, :, :3]):
    # decoded = tfio.experimental.image.decode_tiff(raw_bytes)[:, :, :3]
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    return tf.image.resize(as_float, size=[imageSize, imageSize])

In [None]:
# Input pipeline: image paths -> decoded images -> batches.
# The dataset is deliberately neither shuffled nor truncated (no
# drop_remainder): features predicted from it are later paired row-by-row
# with peptide_morph_train_enc, so order and length must match the path array.
path_ds = tf.data.Dataset.from_tensor_slices(peptide_morph_images_train)
image_ds = path_ds.map(parse_images, num_parallel_calls=tf.data.experimental.AUTOTUNE)
train_ds = image_ds.batch(training_batch_size).prefetch(tf.data.experimental.AUTOTUNE)

# Keras image generator that applies the ResNet50 input preprocessing.
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=tf.keras.applications.resnet50.preprocess_input
)

## Initiate self-supervised models

In [None]:
# ImageNet-initialised ResNet50 without its classification head and without
# a built-in pooling layer; it is the shared backbone of the models below.
Resnet50_transfer = tf.keras.applications.ResNet50(
    weights="imagenet",
    include_top=False,
    input_shape=(imageSize, imageSize, 3),
    input_tensor=None,
    pooling=None,
)

# Start frozen. get_resnet_self_supervise_model switches this flag back on
# when it wraps the backbone.
Resnet50_transfer.trainable = False

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# Resnet as backbone
def get_resnet_self_supervise_model(hidden_1, hidden_2, hidden_3):
    """Build the ResNet50 encoder followed by a three-layer projection head.

    The module-level Resnet50_transfer instance is reused (not copied) by
    every call: it is marked trainable and invoked with training=True.
    Its output is globally average-pooled and passed through
    Dense(hidden_1) -> ReLU -> BatchNorm, Dense(hidden_2) -> ReLU -> BatchNorm
    and finally Dense(hidden_3) -> BatchNorm.

    Args:
        hidden_1: units of the first projection layer.
        hidden_2: units of the second projection layer.
        hidden_3: units of the output projection layer.

    Returns:
        A Keras Model mapping (imageSize, imageSize, 3) images to the
        hidden_3-dimensional projection.
    """
    backbone = Resnet50_transfer
    backbone.trainable = True

    image_input = Input((imageSize, imageSize, 3))
    x = backbone(image_input, training=True)
    x = GlobalAveragePooling2D()(x)

    # Hidden projection blocks: Dense -> ReLU -> BatchNorm.
    for width in (hidden_1, hidden_2):
        x = Dense(width)(x)
        x = Activation("relu")(x)
        x = BatchNormalization(epsilon=0.001)(x)

    # Output block: Dense -> BatchNorm, no non-linearity.
    x = Dense(hidden_3)(x)
    x = BatchNormalization(epsilon=0.001)(x)

    return Model(image_input, x)

## Initiate downstream classification model

In [None]:
def get_linear_model(features, num_classes=4):
    """Build a linear (single Dense layer) softmax classifier.

    Args:
        features: number of input features per sample.
        num_classes: number of output classes. Defaults to 4, the value that
            was previously hard-coded, so existing calls are unaffected.

    Returns:
        An uncompiled Keras Sequential model mapping a (features,) vector to
        num_classes softmax probabilities.
    """
    linear_model = Sequential([
        Input(shape=(features,)),
        Dense(num_classes, activation="softmax"),
    ])
    return linear_model

## Hyperparameter tuning with cross-validation

In [None]:
# Seeds 42..46. The same list indexes both the pretrained encoder weight
# files and the random_state of the train/test split.
random_seed_list = np.arange(42, 47)

# Hyperparameters of the self-supervised ("ss") model under evaluation.
#
# There is no loop over hyperparameter choices here: one set is entered by
# hand and cross-validated per run. Google Colab enforces time-outs on long
# GPU sessions, so splitting the sweep into small manual segments gave a more
# flexible and productive workflow.
ss_method, ss_backbone = 'barlow', 'resnet'
ss_batch_size = np.array([16])
# Widths of the three projection-head layers.
phl_1, phl_2, phl_3 = np.array([128]), np.array([64]), np.array([1024])

# Training setup of the downstream classifier.
earlystop_criterion = EarlyStopping(
    monitor='val_accuracy',
    mode='auto',
    patience=20,
    restore_best_weights=True,
    verbose=0,
)
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
metrics = ['accuracy']
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

In [None]:
# Linear evaluation of the self-supervised encoders with cross-validation.
#
# linear_scores axes:
#   0: seed of the pretrained encoder weights
#   1: seed of the 80/20 train/test split
#   2: cross-validation fold
#   3: metric -> accuracy, precision, recall, F1 (the last three weighted)
n_seeds = len(random_seed_list)
# Take the fold count from `cv` rather than hard-coding it, so the array
# always matches the splitter configuration.
linear_scores = np.zeros((n_seeds, n_seeds, cv.get_n_splits(), 4))

# Hyperparameter tag shared by every file name read or written in this cell.
run_tag = '%s_%s_batch%i_project%i_%i_%i' % (ss_method, ss_backbone, ss_batch_size, phl_1, phl_2, phl_3)

# Create the output folders up front so a missing directory cannot abort the
# run when a checkpoint or the final score file is written.
classifier_dir = 'best_classify_models_hyperparameter'
scores_dir = 'classify_scores_hyperparameter'
os.makedirs(classifier_dir, exist_ok=True)
os.makedirs(scores_dir, exist_ok=True)

for i in range(n_seeds):
  # NOTE(review): the weight files are tagged 'res384' while imageSize is 224
  # in this notebook -- confirm this is only a naming leftover.
  resnet_model = get_resnet_self_supervise_model(phl_1, phl_2, phl_3)
  resnet_model.load_weights('%s_res384_seed%i.h5' % (run_tag, random_seed_list[i]))
  # layers[0] is the Input layer, layers[1] the ResNet50 backbone: freeze it.
  resnet_model.layers[1].trainable = False

  # With the layer stack built by get_resnet_self_supervise_model (11 layers),
  # index -9 is the GlobalAveragePooling2D layer, i.e. the encoder output
  # before the projection head.
  feature_extraction_model = Model(resnet_model.input, resnet_model.layers[-9].output)

  # Extract features for every image once per encoder; row order follows
  # train_ds and therefore peptide_morph_train_enc.
  features = feature_extraction_model.predict(train_ds)

  for j in range(n_seeds):
    # Hold out 20% as a test set (unused here); cross-validate on the rest.
    TRAIN_feature, test_feature, TRAIN_label, test_label = train_test_split(
        features, peptide_morph_train_enc, test_size=0.2, shuffle=True,
        stratify=peptide_morph_train_enc, random_state=random_seed_list[j])

    for k, (train_ix, val_ix) in enumerate(cv.split(TRAIN_feature, TRAIN_label), start=1):
      train_feature, val_feature = TRAIN_feature[train_ix, :], TRAIN_feature[val_ix, :]
      train_label, val_label = TRAIN_label[train_ix], TRAIN_label[val_ix]

      # One checkpoint file per (encoder seed, split seed, fold).
      checkpoint_path = '%s/%s_res384_seed%i_seed%i_fold%i_linear.h5' % (
          classifier_dir, run_tag, random_seed_list[i], random_seed_list[j], k)
      checkpoint_model_linear = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', mode='auto',
                                                verbose=0, save_best_only=True, save_weights_only=True)

      # train linear classifier model
      linear_model = get_linear_model(train_feature.shape[1])
      # Give every classifier its own optimizer with the configuration of
      # `adam`: a shared instance would carry its step counter and moment
      # estimates from one fold's model into the next.
      fold_optimizer = adam.__class__.from_config(adam.get_config())
      linear_model.compile(loss="sparse_categorical_crossentropy", metrics=metrics, optimizer=fold_optimizer)
      linear_history = linear_model.fit(train_feature, train_label, validation_data=(val_feature, val_label),
                                        batch_size=training_batch_size, epochs=300, workers=8, use_multiprocessing=True,
                                        verbose=1, callbacks=[earlystop_criterion, checkpoint_model_linear])

      # log best classification model performance: reload the best weights
      # saved by the checkpoint into a fresh classifier.
      linear_model = get_linear_model(train_feature.shape[1])
      linear_model.load_weights(checkpoint_path)
      y_pred_linear = np.argmax(linear_model.predict(val_feature), axis=-1)

      # scikit-learn metrics take (y_true, y_pred) in that order; swapping
      # them changes weighted precision and recall.
      linear_scores[i, j, k - 1] = np.array([accuracy_score(val_label, y_pred_linear),
                                             precision_score(val_label, y_pred_linear, average='weighted'),
                                             recall_score(val_label, y_pred_linear, average='weighted'),
                                             f1_score(val_label, y_pred_linear, average='weighted')])

# Persist all scores of this hyperparameter set. The original referenced the
# undefined name `ss_batchsize` here, which raised NameError after training
# had finished.
np.savez_compressed('%s/%s.npz' % (scores_dir, run_tag), scores=linear_scores)

print('average classification accuracy precision recall f1_score')
print(np.average(linear_scores, axis=(0,1,2)))
print('standard deviation')
print(np.std(linear_scores, axis=(0,1,2)))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoc