# Organize data into folders

In [2]:
import os
import shutil
import pandas as pd
import numpy as np

In [3]:
# examine folders
base_dir = '../data'

In [4]:
# list of main data sources
os.listdir(base_dir)

['cassava-leaf-disease-classification',
 'plantvillage dataset',
 'rice_leaf_diseases']

In [5]:
for dir in os.listdir(base_dir):
    print(f'{dir}: {os.listdir(os.path.join(base_dir, dir))}')

cassava-leaf-disease-classification: ['label_num_to_disease_map.json', 'sample_submission.csv', 'test_images', 'test_tfrecords', 'train.csv', 'train_images', 'train_tfrecords']
plantvillage dataset: ['color', 'grayscale', 'segmented']
rice_leaf_diseases: ['_BrownSpot', '_Healthy', '_Hispa', '_LeafBlast']


In [6]:
cassava_dir = os.path.join(base_dir, 'cassava-leaf-disease-classification')
plant_village_dir = os.path.join(base_dir, 'plantvillage dataset')
rice_dir = os.path.join(base_dir, 'rice_leaf_diseases')

In [7]:
len(os.listdir(os.path.join(cassava_dir, 'train_images')))

21397

In [8]:
os.listdir(os.path.join(plant_village_dir, 'color'))

['Apple___Apple_scab',
 'Apple___Black_rot',
 'Apple___Cedar_apple_rust',
 'Apple___healthy',
 'Blueberry___healthy',
 'Cherry_(including_sour)___healthy',
 'Cherry_(including_sour)___Powdery_mildew',
 'Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot',
 'Corn_(maize)___Common_rust_',
 'Corn_(maize)___healthy',
 'Corn_(maize)___Northern_Leaf_Blight',
 'Grape___Black_rot',
 'Grape___Esca_(Black_Measles)',
 'Grape___healthy',
 'Grape___Leaf_blight_(Isariopsis_Leaf_Spot)',
 'Orange___Haunglongbing_(Citrus_greening)',
 'Peach___Bacterial_spot',
 'Peach___healthy',
 'Pepper,_bell___Bacterial_spot',
 'Pepper,_bell___healthy',
 'Potato___Early_blight',
 'Potato___healthy',
 'Potato___Late_blight',
 'Raspberry___healthy',
 'Soybean___healthy',
 'Squash___Powdery_mildew',
 'Strawberry___healthy',
 'Strawberry___Leaf_scorch',
 'Tomato___Bacterial_spot',
 'Tomato___Early_blight',
 'Tomato___healthy',
 'Tomato___Late_blight',
 'Tomato___Leaf_Mold',
 'Tomato___Septoria_leaf_spot',
 'Tomato___Spid

In [9]:
for dir in os.listdir(os.path.join(plant_village_dir, 'color')):
    print(f'{dir}: {len(os.listdir(os.path.join(plant_village_dir, "color", dir)))}')

Apple___Apple_scab: 630
Apple___Black_rot: 621
Apple___Cedar_apple_rust: 275
Apple___healthy: 1645
Blueberry___healthy: 1502
Cherry_(including_sour)___healthy: 854
Cherry_(including_sour)___Powdery_mildew: 1052
Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot: 513
Corn_(maize)___Common_rust_: 1192
Corn_(maize)___healthy: 1162
Corn_(maize)___Northern_Leaf_Blight: 985
Grape___Black_rot: 1180
Grape___Esca_(Black_Measles): 1383
Grape___healthy: 423
Grape___Leaf_blight_(Isariopsis_Leaf_Spot): 1076
Orange___Haunglongbing_(Citrus_greening): 5507
Peach___Bacterial_spot: 2297
Peach___healthy: 360
Pepper,_bell___Bacterial_spot: 997
Pepper,_bell___healthy: 1478
Potato___Early_blight: 1000
Potato___healthy: 152
Potato___Late_blight: 1000
Raspberry___healthy: 371
Soybean___healthy: 5090
Squash___Powdery_mildew: 1835
Strawberry___healthy: 456
Strawberry___Leaf_scorch: 1109
Tomato___Bacterial_spot: 2127
Tomato___Early_blight: 1000
Tomato___healthy: 1591
Tomato___Late_blight: 1909
Tomato___Leaf_Mold

In [10]:
for dir in os.listdir(rice_dir):
    print(f'{dir}: {len(os.listdir(os.path.join(rice_dir, dir)))}')

_BrownSpot: 523
_Healthy: 1488
_Hispa: 565
_LeafBlast: 779


In [34]:
fnames = []

for dir in os.listdir(os.path.join(plant_village_dir, 'color')):
    columns = dir.split('___')
    columns.append(dir)
    fnames.append(columns)

In [36]:
fnames = pd.DataFrame(fnames, columns=['Tree type', 'Disease', 'Folder'])
fnames

Unnamed: 0,Tree type,Disease,Folder
0,Apple,Apple_scab,Apple___Apple_scab
1,Apple,Black_rot,Apple___Black_rot
2,Apple,Cedar_apple_rust,Apple___Cedar_apple_rust
3,Apple,healthy,Apple___healthy
4,Blueberry,healthy,Blueberry___healthy
5,Cherry_(including_sour),healthy,Cherry_(including_sour)___healthy
6,Cherry_(including_sour),Powdery_mildew,Cherry_(including_sour)___Powdery_mildew
7,Corn_(maize),Cercospora_leaf_spot Gray_leaf_spot,Corn_(maize)___Cercospora_leaf_spot Gray_leaf_...
8,Corn_(maize),Common_rust_,Corn_(maize)___Common_rust_
9,Corn_(maize),healthy,Corn_(maize)___healthy


In [38]:
fnames['Disease'] = fnames['Disease'].str.replace('_', ' ')
fnames['Tree type'] = fnames['Tree type'].str.replace('_', ' ')
fnames

Unnamed: 0,Tree type,Disease,Folder
0,Apple,Apple scab,Apple___Apple_scab
1,Apple,Black rot,Apple___Black_rot
2,Apple,Cedar apple rust,Apple___Cedar_apple_rust
3,Apple,healthy,Apple___healthy
4,Blueberry,healthy,Blueberry___healthy
5,Cherry (including sour),healthy,Cherry_(including_sour)___healthy
6,Cherry (including sour),Powdery mildew,Cherry_(including_sour)___Powdery_mildew
7,Corn (maize),Cercospora leaf spot Gray leaf spot,Corn_(maize)___Cercospora_leaf_spot Gray_leaf_...
8,Corn (maize),Common rust,Corn_(maize)___Common_rust_
9,Corn (maize),healthy,Corn_(maize)___healthy


In [21]:
os.mkdir(os.path.join(base_dir, 'image data'))

In [22]:
os.mkdir(os.path.join(base_dir, 'image data', 'train'))
os.mkdir(os.path.join(base_dir, 'image data', 'validation'))
os.mkdir(os.path.join(base_dir, 'image data', 'test'))

In [23]:
train_path = os.path.join('../data/image data/train')
val_path = os.path.join('../data/image data/validation')
test_path = os.path.join('../data/image data/test')

In [40]:
for tree in fnames['Tree type'].unique().tolist():
    # create directory for tree
    path_1 = os.path.join(train_path, tree)
    path_2 = os.path.join(test_path, tree)
    path_3 = os.path.join(val_path, tree)

    os.mkdir(path_1)
    os.mkdir(path_2)
    os.mkdir(path_3)

    for disease in fnames[fnames['Tree type'] == tree]['Disease'].tolist():
        # create directory to each disease
        sub_path_1 = os.path.join(path_1, disease)
        sub_path_2 = os.path.join(path_2, disease)
        sub_path_3 = os.path.join(path_3, disease)

        os.mkdir(sub_path_1)
        os.mkdir(sub_path_2)
        os.mkdir(sub_path_3)