<a href="https://colab.research.google.com/github/akankshakusf/Project-CNN-Deep-Learning-Malaria-Detection/blob/master/Malaria_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
#import packages
import tensorflow as tf
import numpy as np

#data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#import tensorflow packages
import tensorflow_datasets as tfds
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dense,Flatten, InputLayer


# Data Preparation

## Data Loading

In [7]:
# Import the Malaria dataset from TensorFlow Datasets (TFDS) and split it into three parts:
# - The first 80% of the data will be used for training.
# - The next 10% (80%-90%) will be used for validation
# - The last 10% (90%-100%) will be used for testing.
# and shuffle files

dataset, dataset_info = tfds.load(
    "malaria", with_info=True, shuffle_files=True)




Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/malaria/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/malaria/incomplete.P40Z3L_1.0.0/malaria-train.tfrecord*...:   0%|         …

Dataset malaria downloaded and prepared to /root/tensorflow_datasets/malaria/1.0.0. Subsequent calls will reuse this data.


Scenario: lets say you dnt pull the data out from tensorflow instead you are pull it from you .text file. below is the approuch we will take

In [20]:
def split(dataset,TRAIN_RATIO,VAL_RATIO,TEST_RATIO):
  #main dataset size
  DATASET_SIZE=len(dataset)

  #make train, val, test split
  train_ds= dataset.take(int(TRAIN_RATIO*DATASET_SIZE))
  val_test_ds= dataset.skip(int(TRAIN_RATIO*DATASET_SIZE)) #not important
  val_ds= val_test_ds.take(int(VAL_RATIO*DATASET_SIZE))
  test_ds=val_test_ds.skip(int(VAL_RATIO*DATASET_SIZE))

  #return train,val, tets dataset
  return train_ds, val_ds, test_ds

In [14]:
###### split function logic explaination builder ######

# TRAIN_RATIO=0.6
# VAL_RATIO=0.2
# TEST_RATIO=0.2
# TOTAL=10
# #set range on data for testing logic
# ds=tf.data.Dataset.range(TOTAL)
# #train_sd,val_ds,test_ds=split(ds,TRAIN_RATIO,VAL_RATIO,TEST_RATIO)
# train_ds=ds.take(int(TRAIN_RATIO*TOTAL))
# val_test_ds=ds.skip(int(TRAIN_RATIO*TOTAL)) #not important
# val_ds=val_test_ds.take(int(VAL_RATIO*TOTAL))
# test_ds=val_test_ds.skip(int(VAL_RATIO*TOTAL))

# #print main dataset for review
# print([int(x) for x in ds.as_numpy_iterator()])
# print([int(x) for x in train_ds.as_numpy_iterator()])
# print([int(x) for x in val_test_ds.as_numpy_iterator()])
# print([int(x) for x in val_ds.as_numpy_iterator()])
# print([int(x) for x in test_ds.as_numpy_iterator()])


In [21]:
TRAIN_RATIO=0.8
VAL_RATIO=0.1
TEST_RATIO=0.1

#call "split" f/c for making split
train_dataset, val_dataset, test_dataset =split(dataset["train"],TRAIN_RATIO,VAL_RATIO,TEST_RATIO)

In [22]:
# Check lengths to confirm splits
print(f"Train dataset size: {len(list(train_dataset))}")
print(f"Validation dataset size: {len(list(val_dataset))}")
print(f"Test dataset size: {len(list(test_dataset))}")

Train dataset size: 22046
Validation dataset size: 2755
Test dataset size: 2757


In [23]:
#check len of 3 dataset
print(len(train_dataset),len(train_ds))
print(len(val_dataset),len(test_ds))
print(len(test_dataset),len(test_ds))


NameError: name 'train_ds' is not defined

In [None]:
#check data info
dataset_info

So, in the data 0 represent parasitic
and 1 represents uneffected

In [None]:
# Print a few values from the validation dataset
for i in val_dataset.take(1):
    print(i)

## Dataset Visualization

In [None]:
for sample in train_dataset.take(1):
    print(sample)


# Understanding the Output
# this dataset contains a dictionary where:

# "image" is stored as dataset_element["image"]
# "label" is stored as dataset_element["label"]
# Issue:
# Your loop is unpacking incorrectly because your dataset does not return (image, label) directly.
# Instead, each element is a dictionary with "image" and "label" as keys.

In [None]:
# Loop through the first 16 images in the dataset
for i, sample in enumerate(train_dataset.take(16)):
    image = sample["image"]  # Extract the image
    label = sample["label"]  # Extract the label

    ax = plt.subplot(4, 4, i + 1)
    plt.imshow(image)
    plt.title(dataset_info.features['label'].int2str(label))  # Convert label to class name
    plt.axis("off")

plt.show()


## Data Preprocessing

### Data Augmentation

- These images are of bigger sizes (255,255,2) etc. But we will have to Normalize them and bring them in the range of 0-1 so Deep model converges or inference faster

In [None]:
#define the image size we want to reduce to
IM_SIZE=224

In [None]:
#function to perform resizing on all images
def resize_rescale(inputs):
  return tf.image.resize(inputs['image'],(IM_SIZE,IM_SIZE))/255.,inputs['label']

In [None]:
#map the data
train_dataset=train_dataset.map(resize_rescale)

In [None]:
#review one of the image testing to see if changed to IM_SIZE=224
for image,label in train_dataset.take(1):
  print(image,label)

tf.Tensor(
[[[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  ...
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  ...
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  ...
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 ...

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  ...
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  ...
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  ...
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]], shape=(224, 224, 3), dtype=float32) tf.Tensor(1, shape=(), dtype=int64)


* see that image size is now 224 : of shape shape=(224, 224, 3), dtype=float32) tf.Tensor(1, shape=(), dtype=int64)
- tf.Tensor(1, meaning its was not infected cell

In [None]:
#check for unique values in transformed train_dataset
np.set_printoptions(suppress=True, precision=6)
np.unique(image)

array([0.      , 0.000025, 0.000026, ..., 0.850912, 0.850937, 0.85098 ],
      dtype=float32)

* Notice - here i goal is acchieved of having all values in between 0 to 1

# Model Creation and Training

In [5]:
model=tf.keras.Sequential([
                        InputLayer(input_shape=(IM_SIZE,IM_SIZE,3)),
                        Conv2D(filters=6,kernel_size=5,strides=1,padding="valid",activation="sigmoid"),
                        MaxPool2D(pool_size=2,strides=2),

                        Conv2D(filters=16,kernel_size=5,strides=1,padding="valid",activation="sigmoid"),
                        MaxPool2D(pool_size=2,strides=2),

                        Flatten(),
                        Dense(1000,activation="sigmoid"),
                        Dense(100,activation="sigmoid"),
                        Dense(2,activation="sigmoid"),

])


NameError: name 'IM_SIZE' is not defined