# **Preparing data to train the models**

Objectives: Preprocess the cherry leaves dataset for model training.

Inputs: Cherry leaves dataset downloaded from Kaggle.

Outputs: Preprocessed dataset ready for model training.

# Steps for preparing the data:


## 1. Install prerequisites

In [None]:
# Install the packages listed in the repository's requirements.txt into the
# environment of the running kernel.
# NOTE(review): the path is absolute and tied to this workspace layout —
# confirm it before running the notebook from another location.
%pip install -r /workspaces/Mildew-Detection-in-Cherry-Leaves/requirements.txt

## 2. Import libraries

In [2]:
import os
import shutil
import zipfile

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import tensorflow as tf
from tensorflow import keras

'/workspaces/Mildew-Detection-in-Cherry-Leaves/jupyter_notebooks'

## 3. Unzip files

In [None]:
# Extract the downloaded archive next to itself, then delete the archive.
# NOTE(review): DestinationFolder is not defined in any visible cell — it must
# be set to the download directory before this cell runs.
archive_path = DestinationFolder + '/cherry-leaves.zip'

with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(DestinationFolder)

# The extracted files are all that is needed from here on.
os.remove(archive_path)

You set a new current directory


## 4. Merge and Group Images by Category

In [None]:
def merge_and_group_images(source_folders, destination_folders):
    """Move all images from per-split category folders into one folder per category.

    For each folder in ``source_folders`` the sub-folders ``healthy`` and
    ``powdery_mildew`` are emptied into the matching entry of
    ``destination_folders`` and then removed.

    Args:
        source_folders: Iterable of directory paths, each containing a
            ``healthy`` and a ``powdery_mildew`` sub-folder.
        destination_folders: Mapping from category name (``"healthy"``,
            ``"powdery_mildew"``) to the directory that receives that
            category's files. Missing directories are created.

    Raises:
        FileNotFoundError: If an expected category sub-folder does not exist.
        shutil.Error: If a file with the same name is already present in the
            destination directory.
    """
    for folder in source_folders:
        for category in ('healthy', 'powdery_mildew'):
            source_path = os.path.join(folder, category)
            destination_path = destination_folders[category]
            # The destination must exist as a directory: otherwise shutil.move
            # renames each file to the destination path itself, and every
            # file overwrites the previous one.
            os.makedirs(destination_path, exist_ok=True)
            for file_name in os.listdir(source_path):
                shutil.move(os.path.join(source_path, file_name), destination_path)
            # The category folder is empty now, so it can be removed.
            os.rmdir(source_path)

### Define source folders

In [None]:
# Split folders whose 'healthy' and 'powdery_mildew' sub-folders are merged by
# merge_and_group_images. The paths are relative to the current working
# directory.
source_folders = [
    "inputs/mildew_dataset/cherry-leaves/test",
    "inputs/mildew_dataset/cherry-leaves/train"
]

### Define destination folders

In [None]:
# One target folder per category; the keys must match the category names that
# merge_and_group_images iterates over ('healthy', 'powdery_mildew').
destination_folders = {
    "healthy": "inputs/mildew_dataset/cherry-leaves/healthy",
    "powdery_mildew": "inputs/mildew_dataset/cherry-leaves/powdery_mildew"
}

### Execute the function

In [None]:
# Move every image out of the split folders into the two category folders.
merge_and_group_images(source_folders, destination_folders)

## 5. Create synthetic images so that the healthy and powdery mildew classes contain the same number of images

### Count Images in Each Folder:

In [None]:
import os

# Count the images currently stored for each class. The class folders are the
# ones listed in destination_folders (defined in the merge step of this
# notebook); the former 'path_to_..._folder' strings were unfilled
# placeholders that do not exist on disk.
healthy_images_count = len(os.listdir(destination_folders["healthy"]))
mildew_images_count = len(os.listdir(destination_folders["powdery_mildew"]))

### Determine the Difference and Set Image Generation Count:

In [None]:
# Number of synthetic images needed to give both classes the same size.
images_to_generate = abs(healthy_images_count - mildew_images_count)

# The smaller class is the one to augment. Its folder comes from
# destination_folders (defined in the merge step) instead of the former
# placeholder strings.
folder_to_use = (
    destination_folders["powdery_mildew"]
    if healthy_images_count > mildew_images_count
    else destination_folders["healthy"]
)


### Define Image Transformation Functions:

In [None]:
import random

# ImageEnhance is required by the brightness branch of random_transformation.
from PIL import Image, ImageEnhance, ImageOps


def random_transformation(image_path):
    """Return a randomly transformed copy of the image stored at ``image_path``.

    One of three transformations is picked with equal probability: a
    horizontal mirror, a brightness change by a factor drawn uniformly from
    [0.5, 1.5], or a rotation by 90, 180 or 270 degrees.

    Args:
        image_path: Path of the image file to open.

    Returns:
        A new ``PIL.Image.Image``; the file on disk is not modified.
    """
    # Each transformation returns a new image with its own pixel data, so the
    # source file can be closed as soon as the transformation is done.
    with Image.open(image_path) as image:
        choice = random.randint(1, 3)
        if choice == 1:
            # Mirror
            return ImageOps.mirror(image)
        if choice == 2:
            # Adjust brightness
            return ImageEnhance.Brightness(image).enhance(random.uniform(0.5, 1.5))
        # Rotate
        return image.rotate(random.choice([90, 180, 270]))

### Generate and Save Transformed Images:

In [None]:
# List the source images once, before anything is written, so that only
# original images (never freshly generated ones) are picked for transformation.
source_files = os.listdir(folder_to_use)

for i in range(images_to_generate):
    random_image_file = random.choice(source_files)
    transformed_image = random_transformation(os.path.join(folder_to_use, random_image_file))
    # Save into the folder being augmented: the generated images have to end
    # up in the smaller class for the two classes to become balanced.
    transformed_image.save(os.path.join(folder_to_use, f'transformed_{i}.jpg'))


## 6. Scale down the sample images to reduce the CPU load

In this step, we resize all images in both the 'healthy' and 'powdery_mildew' folders to a smaller size. This reduces the computational load for later processing.


### Code Cell - Import Required Libraries

In [None]:
from PIL import Image
import os
import glob


### Code Cell - Define Resizing Function:

In [None]:
def resize_images(folder_path, output_size=(50, 50), extensions=('.jpg', '.jpeg')):
    """Resize every image in ``folder_path`` in place.

    Args:
        folder_path: Directory whose images are resized. Sub-directories are
            not searched.
        output_size: Target ``(width, height)`` in pixels.
        extensions: File-name suffixes treated as images. Matching ignores
            case, so ``.JPG`` files are handled the same way on every
            platform.

    Each matching file is overwritten with its resized version; files with
    other suffixes are left untouched.
    """
    suffixes = tuple(extension.lower() for extension in extensions)
    # glob.escape keeps special characters in the folder name from being
    # interpreted as wildcards.
    pattern = os.path.join(glob.escape(folder_path), '*')
    for img_path in sorted(glob.glob(pattern)):
        if not img_path.lower().endswith(suffixes):
            continue
        with Image.open(img_path) as img:
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
            resized = img.resize(output_size, Image.LANCZOS)
        # Write only after the source file has been closed.
        resized.save(img_path)

### Resize images in both folders

In [None]:
# Resize the images of both classes in place. The class folders come from
# destination_folders (defined in the merge step); the names
# healthy_folder_path / mildew_folder_path were never defined in this notebook.
resize_images(destination_folders["healthy"])
resize_images(destination_folders["powdery_mildew"])

## 7. Splitting Data into Training, Validation, and Test Sets

Now, we will divide our images into training, validation, and test sets. This helps in model training, tuning, and evaluation.

### Code Cell - Import Required Libraries:

In [None]:
import os
import shutil
import random

### Code Cell - Define Splitting Function:

In [None]:
def split_images_into_sets(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):
    """Randomly distribute the files of each label folder into train/validation/test.

    Every sub-directory of ``my_data_dir`` (other than ``train``,
    ``validation`` and ``test``) is treated as a label. Its files are shuffled
    and moved to ``<my_data_dir>/<set>/<label>/``. The label folder itself is
    left in place, empty.

    Args:
        my_data_dir: Directory containing one sub-directory per label.
        train_set_ratio: Fraction of each label's files moved to ``train``.
        validation_set_ratio: Fraction moved to ``validation``.
        test_set_ratio: Fraction for ``test``; in practice ``test`` receives
            every file not assigned to the other two sets.

    Returns:
        None. If the ratios do not sum to 1.0 a message is printed and
        nothing is moved.
    """
    # Compare with a tolerance: valid ratios such as 0.6 + 0.3 + 0.1 do not
    # sum to exactly 1.0 in floating point.
    ratio_sum = train_set_ratio + validation_set_ratio + test_set_ratio
    if abs(ratio_sum - 1.0) > 1e-9:
        print("Sum of ratios must equal 1.0")
        return

    set_names = ('train', 'validation', 'test')
    # Only real label directories are split: stray files are skipped, and so
    # are the output folders, which would otherwise be treated as labels when
    # they already exist.
    labels = [
        entry for entry in os.listdir(my_data_dir)
        if entry not in set_names and os.path.isdir(os.path.join(my_data_dir, entry))
    ]
    for label in labels:
        label_dir = os.path.join(my_data_dir, label)
        files = os.listdir(label_dir)
        random.shuffle(files)

        train_set_files_qty = int(len(files) * train_set_ratio)
        validation_set_files_qty = int(len(files) * validation_set_ratio)

        # shutil.move does not create missing parent directories.
        for set_name in set_names:
            os.makedirs(os.path.join(my_data_dir, set_name, label), exist_ok=True)

        for i, file in enumerate(files):
            if i < train_set_files_qty:
                set_name = 'train'
            elif i < train_set_files_qty + validation_set_files_qty:
                set_name = 'validation'
            else:
                set_name = 'test'

            shutil.move(
                os.path.join(label_dir, file),
                os.path.join(my_data_dir, set_name, label, file),
            )

### Code Cell - Execute Function:

In [None]:
# Split 70% / 10% / 20% into train / validation / test. The data directory is
# the parent of the 'healthy' and 'powdery_mildew' class folders used in the
# merge step; the former 'path_to_data_directory' was an unfilled placeholder.
split_images_into_sets(
    my_data_dir='inputs/mildew_dataset/cherry-leaves',
    train_set_ratio=0.7,
    validation_set_ratio=0.1,
    test_set_ratio=0.2
)
