# Data-Generator Class 

So far we have been working on training the Siamese network. 
We encountered the problem that loading all the images into memory 
crashes our notebook environment. Since we have a large 
dataset with more than 4 million images, we have to create 
a process that loads the images through a data generator, so that we 
can save memory. 

We cannot exceed the 32 GB memory quota of the 
GPU, so we are going to use a method that loads 
the images dynamically, batch by batch. 



In [1]:
import pandas as pd
import numpy as np
import os
import cv2
from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import load_img

2023-08-25 10:40:51.578229: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libnvinfer.so.6
2023-08-25 10:40:51.579809: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libnvinfer_plugin.so.6


In [4]:
# Here we have an example of a data generator using triplets.
# We need to adapt it to our code so that not all the images are loaded into memory
# and we can still use the GPU. We need to balance the memory of the GPU:
# we only have 32 GB of memory on the GPU.
# The model alone uses 9 GB of that memory, so let's explore this solution.



import pandas as pd
import numpy as np
import os
import cv2
from tensorflow.keras.utils import Sequence

class TripletDataGenerator(Sequence):
    """Stream (anchor, positive, negative) image triplets from disk in batches.

    The triplets are listed in a CSV file. Only the images belonging to the
    batch being requested are read and decoded, so the whole dataset never
    has to be held in memory at once.

    Args:
        csv_file: Path of the CSV file listing the triplets. The columns at
            positions 1, 2 and 3 are read as the anchor, positive and
            negative image paths (position 0 is presumably a saved index
            column -- confirm against the CSV).
        output_size: ``(height, width)`` every image is resized to.
        shuffle: Whether to reshuffle the triplet order on construction and
            at the end of every epoch.
        batch_size: Maximum number of triplets per batch; must be positive.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """

    def __init__(self, csv_file, output_size, shuffle=False, batch_size=10):
        super().__init__()
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        # No base directory is needed: the CSV already stores usable paths.
        self.df = pd.read_csv(csv_file)
        self.output_size = tuple(output_size)
        self.shuffle = shuffle
        self.batch_size = batch_size
        # Build the initial (optionally shuffled) ordering of the triplets.
        self.on_epoch_end()

    def on_epoch_end(self):
        """Reset the row ordering, reshuffling it when ``shuffle`` is set."""
        self.indices = np.arange(len(self.df))
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __len__(self):
        """Return the number of batches per epoch.

        The count is rounded up so the final, possibly smaller, batch is
        served too instead of silently dropping the tail of the dataset.
        """
        return int(np.ceil(len(self.df) / self.batch_size))

    def load_image(self, img_path):
        """Load one image as a float array of shape ``(*output_size, 3)``.

        The image is resized to ``output_size`` while loading; without this,
        any file whose size differs from ``output_size`` cannot be stored in
        the pre-allocated batch arrays. Pixel values are scaled to [0, 1].
        """
        image = load_img(img_path, target_size=self.output_size)
        image = img_to_array(image)

        return image / 255.0  # Normalize to [0, 1]

    def __getitem__(self, idx):
        """Return batch number ``idx``.

        Returns:
            A pair ``([X_anchor, X_positive, X_negative], labels)``. Each
            ``X_*`` array has shape ``(n, *output_size, 3)``, where ``n`` is
            ``batch_size`` except possibly for the last batch. ``labels`` is
            an array of ``n`` ones: triplets carry no real labels, this is
            only a placeholder target.

        Raises:
            IndexError: If ``idx`` is outside ``range(len(self))``.
        """
        if not 0 <= idx < len(self):
            raise IndexError(
                f"batch index {idx} out of range for {len(self)} batches"
            )

        start = idx * self.batch_size
        batch_rows = self.indices[start: start + self.batch_size]
        n_samples = len(batch_rows)

        # Size the arrays by the rows actually present so a short final batch
        # never exposes uninitialised memory. float32 matches what
        # img_to_array produces and halves the footprint versus float64.
        shape = (n_samples, *self.output_size, 3)
        X_anchor = np.empty(shape, dtype=np.float32)
        X_positive = np.empty(shape, dtype=np.float32)
        X_negative = np.empty(shape, dtype=np.float32)

        # Fetch the three path columns for the whole batch in one lookup.
        triplet_paths = self.df.iloc[batch_rows, 1:4].to_numpy()

        for i, (anchor_path, positive_path, negative_path) in enumerate(
            triplet_paths
        ):
            X_anchor[i] = self.load_image(anchor_path)
            X_positive[i] = self.load_image(positive_path)
            X_negative[i] = self.load_image(negative_path)

        # Placeholder labels (all ones); there are no labels for triplets.
        return [X_anchor, X_positive, X_negative], np.ones(n_samples)

# Example usage: build a shuffled generator over the triplets CSV and fetch
# its first batch as a quick sanity check.
csv_file = "csvs/pro_big_triplets.csv"
output_size = (224, 224)  # Spatial size of each image in a batch; adjust as needed
# NOTE: every batch holds 3 * batch_size images (anchor, positive, negative).
batch_size = 3000

data_generator = TripletDataGenerator(
    csv_file=csv_file,
    output_size=output_size,
    shuffle=True,
    batch_size=batch_size,
)
# The second element of a batch is the placeholder label array; ignore it.
batch_inputs, _ = data_generator[0]

TypeError: 'tuple' object is not callable

In [14]:
len(batch_inputs[0])

10

In [19]:
batch_inputs, _ = data_generator[0]
len(batch_inputs[0])

10

In [23]:
batch_inputs[0][3]

'crop_pro_nat/JM015 Monotas/JM015 Monotas_original_elzapotal_ZAP06B06_2013_R5V_Panthera onca.JPG_9f1e4c3e-a8b3-4087-9538-b434028b1af0.JPG_left_top.jpg'

In [24]:
batch_inputs[1][3]

'crop_pro_nat/JM015 Monotas/elzapotal_ZAP06CUDE06_2013_R5_441 JM-12.JPG_cropped_img.jpg'

In [25]:
batch_inputs[2][3]

'crop_pro_nat/JH010 Sarabi/JH010 Sarabi_original_439851.668_2363207.75_2016 (1).JPG_83acd3c8-5a3e-4906-ad94-88e96a2772de.JPG_left_bottom.jpg'