# Start

In [1]:
import cv2 as cv
import albumentations as A
import os
import sys
import datetime
import io

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dense, Flatten, Input, BatchNormalization, Layer, Dropout, Resizing, Rescaling, RandomFlip, RandomRotation
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.metrics import BinaryAccuracy, FalsePositives, FalseNegatives, TruePositives, Accuracy,TrueNegatives, AUC, Precision, Recall
from tensorflow.keras.callbacks import Callback, CSVLogger, EarlyStopping, LearningRateScheduler, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import L2, L1
import tensorflow_probability as tfp
from tensorboard.plugins.hparams import api as hp

import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.metrics import confusion_matrix, roc_curve

# !pip install -U wandb
import wandb
# !wandb login

# from google.colab import drive
# drive.mount('/content/drive')
# data_dir = '/content/drive/MyDrive/tfds_data/'

# Fetch the TFDS "malaria" dataset as (image, label) pairs together with its metadata.
# Because `split` is given as a list, `tfds.load` hands back a list of datasets,
# so the actual tf.data.Dataset is reached through `dataset[0]`.
dataset, dataset_info = tfds.load(
    name="malaria",
    split=["train"],
    shuffle_files=True,
    as_supervised=True,
    with_info=True,
    # data_dir=data_dir,  # Use Google Drive for storage
)

print(dataset)
# print(dataset_info)

  check_for_updates()
2024-11-02 19:52:16.457290: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730587936.844525     949 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730587936.922119     949 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-02 19:52:17.979688: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[<_PrefetchDataset element_spec=(TensorSpec(shape=(None, None, 3), dtype=tf.uint8, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>]


I0000 00:00:1730587956.980262     949 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2865 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1


In [2]:
# Local working directories used by the cells below.
RAW_DIR = "./raw_dataset"                # one compressed .npz per raw sample
DOWNLOAD_DIR = "./artifacts"             # where wandb artifacts are downloaded
PREPROCESSED_DIR_1 = "./preprocessed_1"  # resized / normalized samples

In [None]:
# Create the output directories up front; re-running this cell is harmless
# because existing directories are left untouched.
for output_dir in (RAW_DIR, PREPROCESSED_DIR_1):
    os.makedirs(output_dir, exist_ok=True)

# raw_dataset

In [None]:
# Generate Raw Dataset
# Every (image, label) pair is written to its own compressed *.npz archive so both
# stay together; plain np.savez was tried first but produced files that were too large.
for sample_idx, (image, label) in enumerate(dataset[0]):
    sample_path = f"{RAW_DIR}/raw_dataset_{sample_idx}_.npz"
    with open(sample_path, "wb") as npz_file:
        np.savez_compressed(npz_file, image=image.numpy(), label=label.numpy())
    # Lightweight progress marker every 5000 samples.
    if sample_idx % 5000 == 0:
        print(sample_idx, end=" | ")

0 | 5000 | 10000 | 15000 | 20000 | 25000 | 

2024-11-02 14:44:01.093228: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [22]:
# Sanity check: reload one exported sample and show its top-left pixel and its label.
# The arrays are stored under the keyword names used at save time ("image", "label"),
# not under the positional default "arr_0".
sample_path = f"{RAW_DIR}/raw_dataset_5_.npz"
with open(sample_path, "rb") as npz_file:
    obj = np.load(npz_file, allow_pickle=True)
    top_left_pixel = obj["image"][0][0]
    print(top_left_pixel, obj["label"])

[0 0 0] 0


In [23]:
# Publish the exported raw samples as the wandb artifact "new_dataset".
with wandb.init(project="Malaria-Detection", entity="albertalvin8080-academic") as run:
    raw_description = (
        "The Malaria dataset contains a total of 27,558 cell images with equal instances of parasitized and uninfected cells from the thin blood smear slide images of segmented cells."
    )
    # Provenance of the data as published by TFDS.
    raw_metadata = {
        "source": "TFDS",
        "homepage": "https://lhncbc.nlm.nih.gov/publication/pub9932",
        "source_code": "tfds.image_classification.Malaria",
        "version": "1.0.0",
        "download_size": "~337.08 MiB",
    }

    artifact = wandb.Artifact(
        name="new_dataset",
        type="raw_dataset",
        description=raw_description,
        metadata=raw_metadata,
    )
    # Register every file under RAW_DIR, then persist the artifact.
    artifact.add_dir(RAW_DIR)
    artifact.save()
    # Alternative way of logging through the run object:
    # run.log_artifact(artifact)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011116347533334191, max=1.0…

[34m[1mwandb[0m: Adding directory to artifact (./raw_dataset)... Done. 100.9s


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [None]:
# Download the raw dataset artifact for local preprocessing.
with wandb.init(project="Malaria-Detection", entity="albertalvin8080-academic") as run:
    artifact = run.use_artifact('albertalvin8080-academic/Malaria-Detection/new_dataset:v0', type='raw_dataset')
    # `root` is the directory the files are written into. Passing DOWNLOAD_DIR itself
    # would drop all .npz files straight into ./artifacts, while the preprocessing cell
    # reads them from DOWNLOAD_DIR + "/new_dataset:v0". Download into that per-artifact
    # sub-directory so the notebook works on a fresh kernel and different artifacts
    # do not get mixed together in one folder.
    train_artifact_dir = artifact.download(
        root=os.path.join(DOWNLOAD_DIR, "new_dataset:v0")
    )

[34m[1mwandb[0m: Downloading large artifact new_dataset:v0, 387.08MB. 27558 files... 
[34m[1mwandb[0m:   27558 of 27558 files downloaded.  
Done. 0:3:34.2


# preprocessed_dataset

In [25]:
IMG_SIZE = 224  # side length (pixels) of the square images produced below


@tf.function
def resize_and_normalize(image, label):
    """Resize `image` to IMG_SIZE x IMG_SIZE and divide its values by 255.

    Args:
        image: image to resize (callers in this notebook pass the arrays loaded
            from the raw .npz files).
        label: passed through untouched so image and label stay paired.

    Returns:
        A `(resized_and_scaled_image, label)` tuple.
    """
    target_size = (IMG_SIZE, IMG_SIZE)
    resized = tf.image.resize(image, target_size)
    return resized / 255.0, label

In [None]:
# Resize/normalize a subset of the downloaded raw samples and store each result
# as its own compressed .npz file under PREPROCESSED_DIR_1.
train_artifact_dir = "/new_dataset:v0"
raw_artifact_path = DOWNLOAD_DIR + train_artifact_dir
# NOTE: samples are streamed one file at a time on purpose; collecting ~27000 images
# in Python lists at runtime is NOT a good idea.

# os.listdir() returns names in arbitrary order, so slicing it directly would pick a
# different 1200-file subset from run to run. Sorting first makes the subset (and the
# file -> index mapping below) reproducible.
raw_file_names = sorted(os.listdir(raw_artifact_path))[:1200]

for i, file_name in enumerate(raw_file_names):
    with open(os.path.join(raw_artifact_path, file_name), mode="rb") as f:
        # The archives only hold the "image" and "label" arrays written with
        # np.savez_compressed, so unpickling is not needed; keeping it disabled avoids
        # executing arbitrary code from a tampered download.
        obj = np.load(f, allow_pickle=False)
        image, label = obj["image"], obj["label"]
        image, label = resize_and_normalize(image, label)

    # NOTE: image and label are still stored together in one file, but the numbering
    # does not follow the original dataset order: names sort lexicographically, e.g.
    # - 'raw_dataset_0_.npz',
    # - 'raw_dataset_10000_.npz',
    # - 'raw_dataset_2_.npz',
    with open(os.path.join(PREPROCESSED_DIR_1, f"preprocessed_{i}_.npz"), "wb") as f:
        # NOTE: files will probably be large because the image is now floating point.
        np.savez_compressed(f, image=image.numpy(), label=label.numpy())

    # Progress marker every 300 files (prints the zero-based index).
    if (i + 1) % 300 == 0:
        print(i, end="\r", flush=True)

1199

In [None]:
# Publish the preprocessed samples as the artifact "preprocessed_full_dataset".
with wandb.init(project="Malaria-Detection", entity="albertalvin8080-academic") as run:
    # NOTE: declaring the raw artifact as an input of this run is what links the two
    # artifacts together.
    new_dataset_artifact = run.use_artifact(
        'albertalvin8080-academic/Malaria-Detection/new_dataset:v0',
        type='raw_dataset',
    )

    artifact = wandb.Artifact(
        name="preprocessed_full_dataset",
        type="preprocessed_dataset",
    )
    artifact.add_dir(PREPROCESSED_DIR_1)
    artifact.save()

[34m[1mwandb[0m: Adding directory to artifact (./preprocessed_1)... Done. 5.2s


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

# train_dataset, val_dataset and test_dataset

In [None]:
# Fetch the preprocessed artifact into wandb's default download location.
with wandb.init(project="Malaria-Detection", entity="albertalvin8080-academic") as run:
    preprocessed_full_name = (
        "albertalvin8080-academic/Malaria-Detection/preprocessed_full_dataset:v0"
    )
    preprocessed_full_artifact = run.use_artifact(
        preprocessed_full_name, type="preprocessed_dataset"
    )
    preprocessed_full_artifact.download()

[34m[1mwandb[0m: Downloading large artifact preprocessed_full_dataset:v0, 368.05MB. 1200 files... 
[34m[1mwandb[0m:   1200 of 1200 files downloaded.  
Done. 0:0:16.3


In [None]:
# Split the preprocessed files into train / val / test artifacts (80 / 10 / 10).
with wandb.init(project="Malaria-Detection", entity="albertalvin8080-academic") as run:
    # NOTE: declaring the source artifact as an input of this run is what links the
    # artifacts together.
    preprocessed_full_artifact = run.use_artifact(
        "albertalvin8080-academic/Malaria-Detection/preprocessed_full_dataset:v0",
        type="preprocessed_dataset",
    )

    train_artifact = wandb.Artifact(
        name="train_dataset",
        type="preprocessed_dataset",
        description="Training dataset",
    )
    val_artifact = wandb.Artifact(
        name="val_dataset",
        type="preprocessed_dataset",
        description="Validation dataset",
    )
    test_artifact = wandb.Artifact(
        name="test_dataset", type="preprocessed_dataset", description="Testing dataset"
    )

    train_ratio = 0.8
    val_ratio = 0.1
    test_ratio = 0.1  # informational: the test split is simply the remainder

    dir_name = "artifacts/preprocessed_full_dataset:v0/"
    # os.listdir() returns names in arbitrary order; sort them so that the same files
    # end up in the same split every time this cell is run.
    files = sorted(os.listdir(dir_name))
    files_len = len(files)

    # Compute each boundary once so adjacent slices share exactly the same index.
    train_end = int(files_len * train_ratio)
    val_end = int(files_len * (train_ratio + val_ratio))

    train_files = files[:train_end]
    val_files = files[train_end:val_end]
    test_files = files[val_end:]

    # The three splits must partition the file list: nothing dropped, nothing duplicated.
    assert len(train_files) + len(val_files) + len(test_files) == files_len

    print(len(train_files), len(val_files), len(test_files))

    # os.path.join avoids the doubled "/" that plain concatenation produced, since
    # dir_name already ends with a separator.
    for split_artifact, split_files in (
        (train_artifact, train_files),
        (val_artifact, val_files),
        (test_artifact, test_files),
    ):
        for file_name in split_files:
            split_artifact.add_file(os.path.join(dir_name, file_name))

    train_artifact.save()
    val_artifact.save()
    test_artifact.save()

# augment_dataset

In [3]:
# Fetch the training split into wandb's default download location.
with wandb.init(project="Malaria-Detection", entity="albertalvin8080-academic") as run:
    train_dataset_name = "albertalvin8080-academic/Malaria-Detection/train_dataset:v0"
    artifact = run.use_artifact(train_dataset_name, type="preprocessed_dataset")
    artifact.download()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33malbertalvin8080[0m ([33malbertalvin8080-academic[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact train_dataset:v0, 294.18MB. 960 files... 
[34m[1mwandb[0m:   960 of 960 files downloaded.  
Done. 0:0:20.3


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [4]:
@tf.function
def augment(image, label):
    """Return an augmented copy of `image` together with its unchanged `label`.

    The image is passed through `tf.image.random_flip_up_down` and then through
    `tf.image.rot90` with `k=1`. Note that only the flip is random; the rotation
    is applied on every call.
    """
    flipped = tf.image.random_flip_up_down(image)
    # A random left/right flip was tried as well and is currently disabled:
    # flipped = tf.image.random_flip_left_right(flipped)
    rotated = tf.image.rot90(flipped, k=1)
    return rotated, label

In [7]:
# Augment every training sample and publish the result as "augmented_dataset".
with wandb.init(project="Malaria-Detection", entity="albertalvin8080-academic") as run:
    # Declaring train_dataset as an input of this run links the augmented artifact to it.
    train_artifact = run.use_artifact(
        "albertalvin8080-academic/Malaria-Detection/train_dataset:v0",
        type="preprocessed_dataset",
    )

    augmented_artifact = wandb.Artifact(
        name="augmented_dataset",
        type="preprocessed_dataset",
        description="Augmented version of train_dataset.",
    )

    train_artifact_dir = "artifacts/train_dataset:v0/"
    augmented_dir = "./preprocessed_augmented/"
    os.makedirs(augmented_dir, exist_ok=True)

    # os.listdir() returns names in arbitrary order; sort so that a given training
    # file always maps to the same augmented_dataset_{i}_ index across runs.
    files = sorted(os.listdir(train_artifact_dir))
    for i, file_name in enumerate(files):
        with open(os.path.join(train_artifact_dir, file_name), mode="rb") as f:
            # The archives only contain the "image" and "label" arrays, so unpickling
            # is not needed; keeping it disabled avoids executing arbitrary code from
            # a tampered download.
            obj = np.load(f, allow_pickle=False)
            image, label = obj["image"], obj["label"]
            image, label = augment(image, label)

        with open(os.path.join(augmented_dir, f"augmented_dataset_{i}_.npz"), mode="wb") as f:
            np.savez_compressed(f, image=image.numpy(), label=label.numpy())

    augmented_artifact.add_dir(augmented_dir)
    augmented_artifact.save()


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011116323011111085, max=1.0…

[34m[1mwandb[0m: Adding directory to artifact (./preprocessed_augmented)... Done. 6.2s


VBox(children=(Label(value='0.161 MB of 0.559 MB uploaded\r'), FloatProgress(value=0.28902975557372823, max=1.…