In [1]:
# NOTE: For the teachers (or are you called examiners?), as of now all work within this notebook is done by Alex Nordin
#
# Code Ownership Tag -> @Alex Nordin
#

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.applications import vgg16, inception_v3
from keras.optimizers import schedules
from keras.callbacks import ReduceLROnPlateau
from keras.models import load_model
from keras import utils
#install using pip3 install vit-keras.
from vit_keras import vit

2022-12-12 13:12:04.167926: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-12 13:12:04.485603: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-12-12 13:12:05.303772: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-12-12 13:12:05.303859: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [3]:
# Locations of the train / validation / test splits of the mushroom dataset.
# Kaggle URL: https://www.kaggle.com/datasets/mustai/mushroom-12-9528
# The dataset root defaults to the original author's local checkout. To run the notebook on another
# machine, set the MUSHROOM_DATA_DIR environment variable to the folder that contains the
# train/, valid/ and test/ sub-directories instead of editing this cell.
mush_data_root = os.environ.get('MUSHROOM_DATA_DIR', '/home/alex/Documents/AI_proj/mush_alt')

mush_alt_train = os.path.join(mush_data_root, 'train')
mush_alt_val = os.path.join(mush_data_root, 'valid')
mush_alt_test = os.path.join(mush_data_root, 'test')

In [4]:
#img_size = 180
# The ViT model needs a larger input than the 180px used before; the price is slower training
# and a bigger memory footprint. :/
img_size = 224

# The dataset uploader already split the images into training/validation/testing folders, so each
# folder is imported in turn with the same options.
# image_dataset_from_directory() uses the sub-directories inside each folder as the image labels.
_split_options = dict(seed=123, image_size=(img_size, img_size))

training_set = tf.keras.utils.image_dataset_from_directory(mush_alt_train, **_split_options)
validation_set = tf.keras.utils.image_dataset_from_directory(mush_alt_val, **_split_options)
test_set = tf.keras.utils.image_dataset_from_directory(mush_alt_test, **_split_options)

Found 6664 files belonging to 12 classes.


2022-12-12 13:13:05.513178: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-12 13:13:05.608916: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-12 13:13:05.609674: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-12 13:13:05.613388: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Found 953 files belonging to 12 classes.
Found 1911 files belonging to 12 classes.


In [5]:
# To get better performance over such a large data set we cache(), keeping the dataset in memory
# after the first epoch. Additionally prefetch() allows us to "overlap preprocessing and model
# execution during training".
AUTOTUNE = tf.data.AUTOTUNE

training_set = training_set.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
validation_set = validation_set.cache().prefetch(buffer_size=AUTOTUNE)
# Build the test pipeline from test_set itself. It used to be derived from validation_set, which
# replaced the held-out test data with the validation data, so every "test" evaluation silently
# re-scored the validation set.
test_set = test_set.cache().prefetch(buffer_size=AUTOTUNE)

In [6]:
#VGG16 is a pre-trained model with purported high performance and accuracy
# base_model = vgg16.VGG16(weights="imagenet", 
#     include_top=False,
#     input_shape=(img_height, img_width, 3))

# Inception_v3 is apparently also a good pre-trained model.
# base_model = inception_v3.InceptionV3(
#     include_top=False,
#     weights="imagenet",
#     input_shape=(img_size, img_size, 3)
# )

# #Trying our resnet50
# base_model = ResNet50(
#     include_top=False,
#     weights="imagenet",
#     input_shape=(img_size, img_size, 3)
# )

#base_model.trainable = False

# ViT (Vision Transformer) is a transformer-based image model, not a recurrent network. It is our
# current best performer, at around 80% accuracy in the runs recorded in this notebook.
# NOTE(review): with include_top=False the classification head is left out, so `classes` and
# `activation` are presumably unused here - confirm against the vit-keras source.
base_model = vit.vit_b32(
    image_size=img_size,
    classes=12,
    activation="softmax",
    include_top=False,
    pretrained=True,
    pretrained_top=False,
)



In [7]:
#A few different schedulers to decay our learning rate as training progresses. Currently not in use, in favor of a callback that decreases the learning rate when the validation loss plateaus.

# scheduler = keras.optimizers.schedules.CosineDecay(
#     initial_learning_rate=0.001,
#     decay_steps=10000,
#     )

# exp_scheduler = keras.optimizers.schedules.ExponentialDecay(
#     initial_learning_rate=0.0001,
#     decay_steps=1000,
#     decay_rate=0.9
#     )

# fine_scheduler = keras.optimizers.schedules.CosineDecay(
#     initial_learning_rate=0.000001,
#     decay_steps=1000,
#     )

# Learning rate constants.
# lr is the initial learning rate passed to the Adam optimizer when the model is compiled.
lr = 1e-4
#fine_lr = 1e-6

In [8]:
# Data augmentation stage, packaged as its own Sequential model. Random flips and random rotations
# are the augmentations I've found to increase accuracy; zoom and crop are kept below, disabled.
_augmentation_layers = [
    keras.layers.RandomFlip(mode="horizontal_and_vertical", seed=987),
    #keras.layers.RandomZoom(height_factor=(-0.2, -0.3), seed=987),
    keras.layers.RandomRotation(factor=0.3, seed=987),
    #keras.layers.RandomCrop(height=120, width=120, seed=987)
]
data_aug = Sequential(_augmentation_layers)

In [9]:
# We have twelve classes
classes = 12


# The model consists of: the data augmentation model, an input-scaling layer, our pre-trained ViT
# base model, and a small classification head (flatten -> batch norm -> dense layer with 512
# neurons -> batch norm -> output layer with one probability per class).
#
# Input scaling: vit-keras' own preprocessing (vit.preprocess_inputs) maps RGB values from 0-255
# to the range [-1, 1], so we rescale to that same range here. Scaling to [0, 1] instead would
# feed the pre-trained weights inputs in a different range from that preprocessing.
model = Sequential([
    data_aug,
    keras.layers.Rescaling(scale=1./127.5, offset=-1),
    #======
    base_model,
    #=====
    keras.layers.Flatten(),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(512, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(classes, activation='softmax')
])

In [10]:
# Compile the model with the optimizer and loss function we want to use: Adam at learning rate
# `lr`, sparse categorical cross-entropy as the loss, and accuracy as the reported metric.
adam_optimizer = keras.optimizers.Adam(learning_rate=lr)
loss_function = tf.keras.losses.SparseCategoricalCrossentropy()

model.compile(
    optimizer=adam_optimizer,
    loss=loss_function,
    metrics=['accuracy'],
)



In [11]:
import neptune.new as neptune

# Start a Neptune "run", which then shows up on my Neptune page.
# To run the notebook yourself, either comment out all the Neptune code, pass your own API token
# as an argument to this call, or expose the token through your environment (on Linux, for example,
# as an export in .bashrc).
run = neptune.init_run(
    source_files=["model_notebook.ipynb"],
    project="alnor/DIT825",
)

https://app.neptune.ai/alnor/DIT825/e/DIT-89
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


In [12]:
from neptune.new.integrations.tensorflow_keras import NeptuneCallback

# Callback that watches the validation loss (val_loss) and multiplies the learning rate by 0.1 once
# val_loss has plateaued for 5 epochs. Currently used instead of a scheduler.
reduce_learning_rate = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.1,
    patience=5,
    min_delta=1e-10,
)
# Callback that successively uploads training information to Neptune under the "metrics" namespace.
neptune_callback = NeptuneCallback(base_namespace="metrics", run=run)

In [13]:
# Train the model for EPOCHS epochs, validating against our validation set after every epoch.
# Do not run this if you can't run on your GPU or it will take forever; even on a GPU expect it to
# take a while.
# The returned History object is kept so the per-epoch metrics can be inspected afterwards
# (training_history.history).
EPOCHS = 30

training_history = model.fit(
    training_set,
    validation_data=validation_set,
    epochs=EPOCHS,
    callbacks=[neptune_callback, reduce_learning_rate],
)


Epoch 1/30


2022-12-06 16:32:37.003662: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8600
2022-12-06 16:32:37.853433: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7febb43b3c10>

In [6]:
# Calling the evaluate method directly on a TensorFlow Dataset works just fine.
# The result holds the loss followed by the accuracy metric the model was compiled with.
test_result_1 = model.evaluate(test_set)





2022-12-08 17:47:04.319246: W tensorflow/core/common_runtime/bfc_allocator.cc:479] Allocator (GPU_0_bfc) ran out of memory trying to allocate 9.00MiB (rounded to 9437184)requested by op StatelessRandomUniformV2
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2022-12-08 17:47:04.319320: I tensorflow/core/common_runtime/bfc_allocator.cc:1033] BFCAllocator dump for GPU_0_bfc
2022-12-08 17:47:04.319351: I tensorflow/core/common_runtime/bfc_allocator.cc:1040] Bin (256): 	Total Chunks: 25, Chunks in use: 25. 6.2KiB allocated for chunks. 6.2KiB in use in bin. 193B client-requested in use in bin.
2022-12-08 17:47:04.319372: I tensorflow/core/common_runtime/bfc_allocator.cc:1040] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2022-12-08 17:47:04.319394: I tensorf

ResourceExhaustedError: Exception encountered when calling layer "MlpBlock_3" "                 f"(type Sequential).

{{function_node __wrapped__StatelessRandomUniformV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[768,3072] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:StatelessRandomUniformV2]

Call arguments received by layer "MlpBlock_3" "                 f"(type Sequential):
  • inputs=tf.Tensor(shape=(None, 50, 768), dtype=float32)
  • training=False
  • mask=None

In [15]:
# for i, layer in enumerate(base_model.layers):
#    print(i, layer.name)

In [16]:
# Save our model to prime.h5 in the working directory for re-use.
# NOTE: For Niklas, the backend can restore it with keras.models.load_model("prime.h5").
model.save("prime.h5")

# evaluate() returned [loss, accuracy] (the order of the model's metrics_names). Log each value to
# its own field; logging the whole list would append both numbers to the "test_accuracy" series.
test_loss, test_accuracy = test_result_1
run["metrics/test_loss"].log(test_loss)
run["metrics/test_accuracy"].log(test_accuracy)
#run["metrics/test_accuracy_fine"].log(test_result_2)
run["my_model/saved_model"].upload("prime.h5")

In [17]:
# Register a new version of the "DIT-MOD" model in the Neptune model registry, attach the saved
# weights and accuracy figures, and promote it to the "production" stage.
version_2 = neptune.init_model_version(
    model="DIT-MOD",
    project="alnor/DIT825"
)

version_2["model"].upload("prime.h5")
# NOTE(review): these figures are typed in by hand rather than read from the training/evaluation
# results, and testing/acc is identical to validation/acc - confirm they were measured on
# different data before relying on them.
version_2["validation/acc"] = 0.806925
version_2["testing/acc"] = 0.806925
version_2["training/acc"] = 0.999549

version_2.change_stage("production")

# Stop the model version once logging is finished, as Neptune asks; otherwise it stays open until
# the notebook kernel is terminated.
version_2.stop()

https://app.neptune.ai/alnor/DIT825/m/DIT-MOD/v/DIT-MOD-4
Remember to stop your model_version once you’ve finished logging your metadata (https://docs.neptune.ai/api/model_version#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


In [18]:
# End the Neptune run now that logging is finished; this waits for the queued operations to
# synchronize with Neptune before returning.
run.stop()

Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 74 operations to synchronize with Neptune. Do not kill this process.
All 74 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/alnor/DIT825/e/DIT-89


In [7]:

# Reload the previously saved model from its location inside the client repository.
saved_model_path = '/home/alex/Documents/AI_proj/repo/shroom/client/saved_model/prime.h5'
init_model = load_model(saved_model_path)

#init_model.predict("/home/alex/Documents/AI_proj/repo/shroom/client/media/001_jNbj1WMvR-8_e02fmlw.jpg")



In [11]:
# Single-image check of the reloaded model: load one picture, resize it to the 224x224 RGB input
# the model was built for, and print the predicted class probabilities.
path = "/home/alex/Documents/AI_proj/repo/shroom/001_jNbj1WMvR-8.jpg"

img = utils.load_img(path, color_mode='rgb', target_size=(224, 224))

# predict() works on batches, so add a leading batch axis to turn the image into a batch of one.
input_arr = np.expand_dims(utils.img_to_array(img), axis=0)

pred = init_model.predict(input_arr)

print(pred)

[[9.8777525e-07 6.3555981e-06 7.3536000e-08 7.5086587e-07 8.5984766e-08
  1.7105616e-08 4.2874934e-08 1.1997899e-08 6.2936720e-06 1.9339440e-08
  9.9998534e-01 2.6667998e-08]]


In [9]:
# Run one more training epoch on the reloaded model and keep the History object that fit() returns.
# NOTE(review): this also updates init_model's weights; presumably the goal is only to read the
# metrics - if so, evaluate() may be enough. Confirm intent.
history = init_model.fit(training_set, validation_data=validation_set, epochs=1)





2022-12-12 13:16:17.949635: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8600
2022-12-12 13:16:18.822961: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory




TypeError: 'History' object is not subscriptable

In [13]:
# Show the display names of the values the reloaded model reports (its loss and metrics).
init_model.metrics_names

['loss', 'accuracy']

In [15]:
# Pull the recorded training loss and accuracy (one entry per epoch) out of the History object
# and print each list on its own line.
loss, acc = (history.history[metric] for metric in ('loss', 'accuracy'))

print(loss, acc, sep="\n")

[0.005834301467984915]
[0.9993997812271118]
