# **Preparing data to train the models**

Objectives: Preprocess the cherry leaves dataset for model training.

Inputs: Cherry leaves dataset downloaded from Kaggle.

Outputs: Preprocessed dataset ready for model training.

# Steps for preparing the data:


## 1. Install prerequisites

In [10]:
%pip install -r /workspaces/Mildew-Detection-in-Cherry-Leaves/requirements.txt


Note: you may need to restart the kernel to use updated packages.
requirements installed


## 2. Import libraries

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import altair as alt
import zipfile
import os
print("requirements installed")

requirements installed


## 3. Unzip files

In [3]:
with zipfile.ZipFile(DestinationFolder + '/cherry-leaves.zip', 'r') as zip_ref:
    zip_ref.extractall(DestinationFolder)

os.remove(DestinationFolder + '/cherry-leaves.zip')

NameError: name 'DestinationFolder' is not defined

## 4. Merge and Group Images by Category

In [5]:
def merge_and_group_images(source_folders, destination_folders):
    # Asegurarse de que los directorios de destino existan
    for _, dest in destination_folders.items():
        if not os.path.exists(dest):
            os.makedirs(dest)

    for folder in source_folders:
        for category in ['healthy', 'powdery_mildew']:
            source_path = folder + '/' + category
            files = os.listdir(source_path)
            for file in files:
                shutil.move(source_path + '/' + file, destination_folders[category])
            os.rmdir(source_path)

### Define source folders

In [6]:
source_folders = [
    "inputs/mildew_dataset/cherry-leaves/test",
    "inputs/mildew_dataset/cherry-leaves/train"
]

### Define destination folders

In [7]:
destination_folders = {
    "healthy": "inputs/mildew_dataset/cherry-leaves/healthy",
    "powdery_mildew": "inputs/mildew_dataset/cherry-leaves/powdery_mildew"
}

### Execute the function

In [14]:
import os
import shutil

def remove_initial_directories(directories):
    for directory in directories:
        shutil.rmtree(directory)

# Define the directories to be removed
directories_to_remove = [
    "inputs/mildew_dataset/cherry-leaves/test",
    "inputs/mildew_dataset/cherry-leaves/train"
]

# Execute the function
remove_initial_directories(directories_to_remove)

## 5. Create sinthetic files to ensure both healthy and powdery leaves contain the same amount

### Count Images in Each Folder:

In [20]:
import os

healthy_images_count = len(os.listdir('/workspaces/Mildew-Detection-in-Cherry-Leaves/inputs/mildew_dataset/cherry-leaves/healthy'))
mildew_images_count = len(os.listdir('/workspaces/Mildew-Detection-in-Cherry-Leaves/inputs/mildew_dataset/cherry-leaves/powdery_mildew'))
print(healthy_images_count)
print(mildew_images_count)


2173
2028


### Determine the Difference and Set Image Generation Count:

In [24]:
images_to_generate = abs(healthy_images_count - mildew_images_count)
folder_to_use = '/workspaces/Mildew-Detection-in-Cherry-Leaves/inputs/mildew_dataset/cherry-leaves/powdery_mildew' if healthy_images_count > mildew_images_count else '/workspaces/Mildew-Detection-in-Cherry-Leaves/inputs/mildew_dataset/cherry-leaves/healthy'
print(images_to_generate)

145


### Define Image Transformation Functions:

In [25]:
from PIL import Image, ImageOps
import random

def random_transformation(image_path):
    image = Image.open(image_path)
    choice = random.randint(1, 3)
    if choice == 1:
        # Mirror
        return ImageOps.mirror(image)
    elif choice == 2:
        # Adjust brightness
        return ImageEnhance.Brightness(image).enhance(random.uniform(0.5, 1.5))
    else:
        # Rotate
        return image.rotate(random.choice([90, 180, 270]))

### Generate and Save Transformed Images:

In [27]:
from PIL import ImageEnhance

for i in range(images_to_generate):
    random_image = random.choice(os.listdir(folder_to_use))
    transformed_image = random_transformation(os.path.join(folder_to_use, random_image))
    save_path = f'{folder_to_use}/synthetic_{i}.jpg'
    transformed_image.save(save_path)

    print(f"Processing image number: {i}")
    print(f"Selected random image for transformation: {random_image}")
    print(f"Saving transformed image to: {save_path}")






Processing image number: 0
Selected random image for transformation: powdery_mildew (624).JPG
Saving transformed image to: /workspaces/Mildew-Detection-in-Cherry-Leaves/inputs/mildew_dataset/cherry-leaves/powdery_mildew/synthetic_0.jpg
Processing image number: 1
Selected random image for transformation: powdery_mildew (679).JPG
Saving transformed image to: /workspaces/Mildew-Detection-in-Cherry-Leaves/inputs/mildew_dataset/cherry-leaves/powdery_mildew/synthetic_1.jpg
Processing image number: 2
Selected random image for transformation: powdery_mildew (467).JPG
Saving transformed image to: /workspaces/Mildew-Detection-in-Cherry-Leaves/inputs/mildew_dataset/cherry-leaves/powdery_mildew/synthetic_2.jpg
Processing image number: 3
Selected random image for transformation: powdery_mildew (1963).JPG
Saving transformed image to: /workspaces/Mildew-Detection-in-Cherry-Leaves/inputs/mildew_dataset/cherry-leaves/powdery_mildew/synthetic_3.jpg
Processing image number: 4
Selected random image for tr

## 6. Scale down the sample images, to reduce cpu needed.

In this step, we will resize all images in both 'healthy' and 'mildew' folders to a smaller size. This reduces the computational load for future processing.


### Code Cell - Import Required Libraries

In [28]:
from PIL import Image
import os
import glob


### Code Cell - Define Resizing Function:

In [37]:
def resize_images(folder_path, output_size=(50, 50)):
    for img_path in glob.glob(folder_path + '/*.jpg'):
        img = Image.open(img_path)
        img = img.resize(output_size, Image.Resampling.LANCZOS)
        img.save(img_path)

### Resize images in both folders

In [38]:
from PIL import Image, ImageOps

resize_images(healthy_folder_path)
resize_images(mildew_folder_path)

## 7. Splitting Data into Training, Validation, and Test Sets

Now, we will divide our images into training, validation, and test sets. This helps in model training, tuning, and evaluation.

### Code Cell - Import Required Libraries:

In [48]:
import os
import shutil
import random

### Code Cell - Define Splitting Function:

In [49]:
def split_images_into_sets(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):
    if train_set_ratio + validation_set_ratio + test_set_ratio != 1.0:
        print("Sum of ratios must equal 1.0")
        return

    labels = os.listdir(my_data_dir)
    for label in labels:

        for set_type in ['train', 'validation', 'test']:
            os.makedirs(os.path.join(my_data_dir, set_type, label), exist_ok=True)
        
        files = os.listdir(os.path.join(my_data_dir, label))
        random.shuffle(files)

        train_set_files_qty = int(len(files) * train_set_ratio)
        validation_set_files_qty = int(len(files) * validation_set_ratio)

        for i, file in enumerate(files):
            src_path = os.path.join(my_data_dir, label, file)
            if i < train_set_files_qty:
                dest_path = os.path.join(my_data_dir, 'train', label, file)
            elif i < train_set_files_qty + validation_set_files_qty:
                dest_path = os.path.join(my_data_dir, 'validation', label, file)
            else:
                dest_path = os.path.join(my_data_dir, 'test', label, file)
            
            shutil.move(src_path, dest_path)

Code Cell - Execute Function:

In [50]:
split_images_into_sets(
    my_data_dir='/workspaces/Mildew-Detection-in-Cherry-Leaves/inputs/mildew_dataset/cherry-leaves',
    train_set_ratio=0.7,
    validation_set_ratio=0.1,
    test_set_ratio=0.2
)


## 8. Image Labeling for Training

Labeling Process Overview
In this step, we'll label the cherry leaf images as 'healthy' or 'powdery mildew' based on their directory names. This labeling is crucial for the model to learn and differentiate between these two categories during the training process. The images are already segregated into distinct folders, simplifying this process. We'll ensure each image is associated with the correct label, forming the foundation for our CNN model training.

### Import libraries

In [8]:
from keras.preprocessing.image import ImageDataGenerator


### Define Parameters:



In [9]:
import os
os.chdir("/workspaces/Mildew-Detection-in-Cherry-Leaves")
print("Current working directory:", os.getcwd())
train_dir = 'inputs/mildew_dataset/cherry-leaves/train'
validation_dir = 'inputs/mildew_dataset/cherry-leaves/validation'
test_dir = 'inputs/mildew_dataset/cherry-leaves/test'

img_width, img_height = 50, 50  # Adjusted image size
batch_size = 20  # Adjusted batch size

Current working directory: /workspaces/Mildew-Detection-in-Cherry-Leaves


### Create ImageDataGenerators:

In [10]:
train_datagen = ImageDataGenerator(rescale=1./255)
validation_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)



### Flow Images from Directories:

In [11]:
train_generator = train_datagen.flow_from_directory( train_dir, target_size=(img_width, img_height), batch_size=batch_size, class_mode='binary')

validation_generator = validation_datagen.flow_from_directory( validation_dir, target_size=(img_width, img_height), batch_size=batch_size, class_mode='binary')

test_generator = test_datagen.flow_from_directory( test_dir, target_size=(img_width, img_height), batch_size=batch_size, class_mode='binary')

Found 3042 images belonging to 2 classes.
Found 434 images belonging to 2 classes.
Found 870 images belonging to 2 classes.


In [12]:
import os

os.chdir("/workspaces/Mildew-Detection-in-Cherry-Leaves")
print("Current working directory:", os.getcwd())

Current working directory: /workspaces/Mildew-Detection-in-Cherry-Leaves


In [13]:
# Display class indices
print("Train Set Class Indices:", train_generator.class_indices)
print("Validation Set Class Indices:", validation_generator.class_indices)
print("Test Set Class Indices:", test_generator.class_indices)

# Optional: Display some image batches with their labels
for image_batch, label_batch in train_generator:
    print("Image batch shape:", image_batch.shape)
    print("Label batch shape:", label_batch.shape)
    break  # Display only the first batch


Train Set Class Indices: {'healthy': 0, 'powdery_mildew': 1}
Validation Set Class Indices: {'healthy': 0, 'powdery_mildew': 1}
Test Set Class Indices: {'healthy': 0, 'powdery_mildew': 1}
Image batch shape: (20, 50, 50, 3)
Label batch shape: (20,)
