# **Data Modelling and Evaluation**

---

## Objectives

* Answer business requirement 2: 
    * The client seeks to predict whether a cherry leaf is healthy or infected with powdery mildew.

## Inputs

* inputs/cherry_leaves_dataset/cherry-leaves/train
* inputs/cherry_leaves_dataset/cherry-leaves/test
* inputs/cherry_leaves_dataset/cherry-leaves/validation
* image shape embeddings

## Outputs

* Images distribution plot in train, validation, and test set
* Image augmentation
* Class indices to change prediction inference in labels
* Machine learning model creation and training
* Save model
* Learning curve plot for model performance
* Model evaluation on pickle file
* Prediction on the random image file





## Additional Comments:

N/A


---

# Set Data Directory

---

## Import libraries

In [101]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.image import imread
import cv2 
import joblib
import random



## Set working directory

In [102]:
cwd= os.getcwd()

In [103]:
os.chdir('/workspace/Portfolio_5_Cherry_Leaves_Mildew')
print("You set a new current directory")

You set a new current directory


In [104]:
work_dir = os.getcwd()
work_dir

'/workspace/Portfolio_5_Cherry_Leaves_Mildew'

## Set input directories

Set train, validation and test paths.

In [105]:
my_data_dir = 'inputs/cherry_leaves_dataset/cherry-leaves'
train_path = my_data_dir + '/train'
val_path = my_data_dir + '/validation'
test_path = my_data_dir + '/test'
print(train_path, val_path, test_path)

inputs/cherry_leaves_dataset/cherry-leaves/train inputs/cherry_leaves_dataset/cherry-leaves/validation inputs/cherry_leaves_dataset/cherry-leaves/test


## Set output directory

In [106]:
version = 'v1'
file_path = f'outputs/{version}'

if 'outputs' in os.listdir(current_dir) and version in os.listdir(current_dir + '/outputs'):
    print('Old version is already available create a new version.')
    pass
else:
    os.makedirs(name=file_path)

Old version is already available create a new version.


### Set label names

In [107]:
# Set labels
labels = os.listdir(train_path)
print('Label for the images are', labels)

Label for the images are ['healthy', 'powdery_mildew']


### Set image shape

In [108]:
## Import saved image shape embedding
version = 'v1'
image_shape = joblib.load(filename=f"outputs/{version}/image_shape.pkl")
image_shape

(50, 50)

---

## Number of images in the train, test, and validation data

---

In [109]:
df_freq = pd.DataFrame([])
for folder in ['train', 'validation', 'test']:
    for label in labels:
        df_freq = df_freq.append(
            pd.Series(data={'Set': folder,
                            'Label': label,
                            'Frequency': int(len(os.listdir(my_data_dir + '/' + folder + '/' + label)))}
                      ),
            ignore_index=True
        )

        print(f"* {folder} - {label}: {len(os.listdir(my_data_dir + '/' + folder + '/' + label))} images")

print("\n")
sns.set_style("whitegrid")
plt.figure(figsize=(8, 5))
sns.barplot(data=df_freq, x='Set', y='Frequency', hue='Label')
plt.savefig(f'{file_path}/labels_distribution.png', bbox_inches='tight', dpi=150)
plt.show()


AttributeError: 'DataFrame' object has no attribute 'append'

---

## Image data augmentation

---

### ImageDataGenerator

In [111]:
def augment_image(image):
    # Randomly apply rotation
    angle = random.randint(-20, 20)
    rows, cols, _ = image.shape
    M = cv2.getRotationMatrix2D((cols / 2, rows / 2), angle, 1)
    image = cv2.warpAffine(image, M, (cols, rows))

    # Randomly apply translation
    x_translation = random.randint(-10, 10)
    y_translation = random.randint(-10, 10)
    M = np.float32([[1, 0, x_translation], [0, 1, y_translation]])
    image = cv2.warpAffine(image, M, (cols, rows))

    # Randomly apply shearing
    shear_factor = random.uniform(-0.1, 0.1)
    shear_matrix = np.array([[1, shear_factor, 0], [0, 1, 0]])
    image = cv2.warpAffine(image, shear_matrix, (cols, rows))

    # Randomly apply zoom
    zoom_factor = random.uniform(0.9, 1.1)
    zoom_matrix = np.array([[zoom_factor, 0, 0], [0, zoom_factor, 0]])
    image = cv2.warpAffine(image, zoom_matrix, (cols, rows))

    # Randomly flip horizontally
    if random.random() > 0.5:
        image = cv2.flip(image, 1)

    return image

# Example usage:
original_image = cv2.imread("path_to_image.jpg")  # Replace with your image path
augmented_image = augment_image(original_image)

# Display the original and augmented images (for visualization purposes)
cv2.imshow("Original Image", original_image)
cv2.imshow("Augmented Image", augmented_image)
cv2.waitKey(0)
cv2.destroyAllWindows()

[ WARN:0@2314.720] global loadsave.cpp:248 findDecoder imread_('path_to_image.jpg'): can't open/read file: check file path/integrity


AttributeError: 'NoneType' object has no attribute 'shape'

### Plot augmented training image

In [None]:
for i in range(3):
    img = train_images[i]
    plt.imshow(img)
    plt.show()