## Importing Libraries

In [11]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

## Data preprocessing

In [None]:
# Loader settings shared by the train and test splits.
batch_size = 32   # images are delivered in batches of 32 rather than one at a time
img_height = 180  # every image is resized to img_height x img_width when it is read
img_width = 180

# Root folder of the dataset. It is expected to contain a "train" and a "test"
# sub-folder, each holding one sub-directory per class.
# Change this single constant when the data lives somewhere else.
DATA_DIR = Path(r"D:\Study stuff\Fruit insights project\model\Model-1-fruit-freshness")


def load_split(split, shuffle):
    """Load one split of the image dataset as a batched tf.data.Dataset.

    Both splits go through this helper so that they are guaranteed to use the
    same image size, batch size and label encoding.

    Args:
        split: Name of the sub-folder of DATA_DIR to read ("train" or "test").
        shuffle: Whether the loader should shuffle the images.

    Returns:
        The dataset produced by tf.keras.utils.image_dataset_from_directory,
        yielding (images, labels) batches.

    Raises:
        FileNotFoundError: If the split folder does not exist, so a wrong
            DATA_DIR is reported clearly instead of failing inside TensorFlow.
    """
    split_dir = DATA_DIR / split
    if not split_dir.is_dir():
        raise FileNotFoundError(
            f"Dataset folder not found: {split_dir} (check DATA_DIR)"
        )

    # label_mode controls how the loader represents the labels;
    # 'categorical' yields one-hot encoded label vectors.
    return tf.keras.utils.image_dataset_from_directory(
        str(split_dir),
        image_size=(img_height, img_width),
        batch_size=batch_size,
        label_mode='categorical',
        shuffle=shuffle,
    )


# The training data is shuffled; the test data keeps the loader's fixed order.
train_ds = load_split("train", shuffle=True)
test_ds = load_split("test", shuffle=False)


Found 8733 files belonging to 4 classes.
Found 2570 files belonging to 4 classes.


### Normalizing pixel values

In [None]:
# Rescale pixel intensities from the original 0-255 range to floats in [0.0, 1.0]
# by multiplying every value by 1/255.
# This only changes the numeric scale of the inputs; it does not compensate for
# lighting or other differences between images.
normalization_layer = tf.keras.layers.Rescaling(1./255)


def normalize_batch(images, labels):
    """Rescale one batch of images and pass its labels through unchanged."""
    return normalization_layer(images), labels


# NOTE: train_ds / test_ds are reassigned in place, so re-running this cell
# without re-running the loading cell first would apply the rescaling a second time.
# num_parallel_calls lets tf.data process several batches concurrently; the order
# of the batches is unchanged because map() is deterministic by default.
train_ds = train_ds.map(normalize_batch, num_parallel_calls=tf.data.AUTOTUNE)
test_ds = test_ds.map(normalize_batch, num_parallel_calls=tf.data.AUTOTUNE)


### Optimizing data set loading

In [None]:
# Input-pipeline tuning: keep already-processed batches in the cache and
# overlap data preparation with training.

# AUTOTUNE asks tf.data to choose the value at runtime instead of hard-coding it;
# here it sets how many batches prefetch() prepares ahead of time.
AUTOTUNE = tf.data.AUTOTUNE

# Training pipeline, in order:
#   cache()    - keep the batches after the first pass so later epochs do not
#                read them from disk again
#   shuffle()  - randomize the order with a buffer of 1000 elements; the elements
#                are whole batches (batching happens in the loader), so this
#                reorders batches rather than individual images
#   prefetch() - prepare upcoming batches while the model trains on the current one
train_ds = (
    train_ds
    .cache()
    .shuffle(1000)
    .prefetch(buffer_size=AUTOTUNE)
)

# Test pipeline: same caching and prefetching, but no shuffling.
test_ds = (
    test_ds
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)
