In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import splitfolders

# Split the raw images in data/ into train/val/test folders (80% / 15% / 5%).
# This only needs to happen once: the call copies every image (~13k files), and
# the training files are renamed by a later cell, so copying again on a
# re-run would presumably leave the originally named files next to the renamed
# ones (TODO confirm against split-folders' behaviour for an existing output).
# Guard on the output folder instead of relying on the call being commented out.
import os

if not os.path.isdir("split-data/"):
    splitfolders.ratio("data/", output="split-data/",
                       seed = 42, ratio=(0.8, 0.15, 0.05))
else:
    print("split-data/ already exists - skipping the split")

Copying files: 13024 files [00:08, 1586.72 files/s]


In [3]:
#Renaming filenames

import os
# Root of the training split (one sub-folder per class). The rename loop in this
# cell and the later cells that build the training generator, check for corrupt
# files and count images per class all read this variable.
directory = "split-data/train/"

'''
##debugging file access (getting comfortable with 'os' and file handling in general)

for folder in os.listdir(directory): #'listdir' returns a list of all files/directories in the specified directory
    print(" ")
    count = 0
    if not os.path.isdir(folder): #'isdir' returns a boolean value: True if the current object is a directory, False otherwise
                              #this is to combat 'listdir' listing hidden files (such as .DS_Store on MacOS), causing errors
        continue
        
    folder_path = os.path.join(directory, folder) #'join' concatenates the first/second argument (creating a new path)
    print(folder_path)
    for filename in os.listdir(folder_path):
        old_path = os.path.join(folder_path, filename)
        print(old_path)
        count+=1
        if count == 3:
            break
'''


##Actually renaming the files:

import uuid


def _rename_class_folder(folder_path, prefix):
    """Rename the files in one class folder to ``<prefix>_<NNNN><ext>``.

    Files are numbered from 1 in sorted-name order so the result is
    reproducible, and the function is safe to run more than once: a folder
    whose files already carry their target names is left untouched.

    Args:
        folder_path: Path of the class folder whose files are renamed.
        prefix: Text placed before the running number (the class name).

    Returns:
        The number of files that were actually renamed.
    """
    # 'listdir' also returns hidden files (such as .DS_Store on MacOS) and
    # sub-directories; neither is an image, so both are skipped.
    names = sorted(
        name for name in os.listdir(folder_path)
        if not name.startswith(".")
        and os.path.isfile(os.path.join(folder_path, name))
    )

    plan = []
    for number, old_name in enumerate(names, start=1):
        extension = os.path.splitext(old_name)[1]  # keeps the original extension
        new_name = f"{prefix}_{number:04d}{extension}"
        if new_name != old_name:
            plan.append((old_name, new_name))

    if not plan:
        return 0

    # Two-phase rename: os.rename silently replaces an existing target on POSIX
    # (and raises on Windows), so a direct rename could destroy a file that
    # still holds a name another file is about to receive. Moving every source
    # to a unique temporary name first frees all target names.
    token = f"renaming-{uuid.uuid4().hex}"
    staged = []
    for position, (old_name, new_name) in enumerate(plan):
        temp_name = f"{token}_{position:06d}{os.path.splitext(old_name)[1]}"
        os.rename(os.path.join(folder_path, old_name),
                  os.path.join(folder_path, temp_name))
        staged.append((temp_name, new_name))

    for temp_name, new_name in staged:
        os.rename(os.path.join(folder_path, temp_name),
                  os.path.join(folder_path, new_name))
        #print(f"Renamed {temp_name} to {new_name}")

    return len(plan)


for folder in sorted(os.listdir(directory)):
    folder_path = os.path.join(directory, folder)
    if not os.path.isdir(folder_path):
        continue

    _rename_class_folder(folder_path, folder)

In [4]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [16]:
# Image generators for the three splits (augmentation settings not finalized).

# Training: pixel values are multiplied by 1/255 and light augmentation is applied.
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255.0,
    shear_range=0.1,
    rotation_range=10,
    brightness_range=[0.8, 1.2],
    horizontal_flip=True,
)
train_set = train_datagen.flow_from_directory(
    directory,
    target_size=(224, 224),
    color_mode="rgb",
    class_mode="categorical",
    batch_size=32,
    shuffle=True,  # we don't want the model to memorize the order of the images
    seed=42,
)

# Validation and test: rescaling only, no augmentation, fixed (unshuffled) order.
_eval_flow_options = {
    "target_size": (224, 224),
    "batch_size": 32,
    "class_mode": "categorical",
    "shuffle": False,
}

validation_datagen = ImageDataGenerator(rescale=1.0 / 255.0)
validation_set = validation_datagen.flow_from_directory(
    "split-data/val", **_eval_flow_options
)

test_datagen = ImageDataGenerator(rescale=1.0 / 255.0)
test_set = test_datagen.flow_from_directory(
    "split-data/test", **_eval_flow_options
)

Found 10414 images belonging to 14 classes.
Found 1947 images belonging to 14 classes.
Found 663 images belonging to 14 classes.


In [13]:
# checking for corrupt files
# Every regular file in every class folder of the training split is opened with
# Pillow and verified; files that fail are listed and counted.
from PIL import Image

count = 0  # number of files Pillow could not open/verify
for folder_name in os.listdir(directory):
    folder_path = os.path.join(directory, folder_name)
    if not os.path.isdir(folder_path):
        continue
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        # Skip anything that is not an image candidate (sub-directories, hidden
        # files such as .DS_Store) and keep going. The previous
        # `if not os.path.isdir(file_path): break` left the loop on the first
        # regular file, so no image was ever verified.
        if filename.startswith(".") or not os.path.isfile(file_path):
            continue
        try:
            # The context manager closes the file even when verify() raises.
            with Image.open(file_path) as im:
                im.verify()
        # IOError is an alias of OSError. SyntaxError is included because some
        # Pillow format plugins appear to report a broken file that way
        # (NOTE(review): confirm for the Pillow version in use).
        except (OSError, SyntaxError, Image.DecompressionBombError):
            print(f"Fail: {filename}")
            count+=1

print(count, "corrputed files")

0 corrputed files


No corrupt files

In [15]:
# exploring / handling class imbalances
# Print the number of files in each class folder of the training split.
for folder_name in os.listdir(directory):
    folder_path = os.path.join(directory, folder_name)
    if not os.path.isdir(folder_path):
        continue
    # Count only regular, non-hidden files so that entries such as .DS_Store or
    # stray sub-directories do not inflate a class's total.
    count = sum(
        1
        for filename in os.listdir(folder_path)
        if not filename.startswith(".")
        and os.path.isfile(os.path.join(folder_path, filename))
    )
    print(folder_name, ": ", count)

Potato_Late_Blight :  800
Corn_Healthy :  929
Wheat_Healthy :  892
Corn_Common_Rust :  954
Rice_Neck_Blast :  800
Potato_Early_Blight :  800
Rice_Leaf_Blast :  781
Wheat_Yellow_Rust :  740
Potato_Healthy :  121
Rice_Healthy :  1191
Corn_Gray_Leaf_Spot :  410
Wheat_Brown_Rust :  721
Corn_Northern_Leaf_Blight :  788
Rice_Brown_Spot :  490


In [None]:
#we can aim to have 700-1000 images in each class

'''
Classes to be oversampled:
- Potato Healthy: 121
- Corn Gray Leaf Spot: 410
- Rice Brown Spot: 490

'Rice Healthy' may be undersampled to 900-1000 observations
'''