# Analysing The Data
* Visualising the different types of data

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [3]:
import os
import sys
import time
import re
import datetime as dt

import warnings
warnings.filterwarnings("ignore")

import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, PReLU
from keras.optimizers import SGD, RMSprop, Adam
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.preprocessing import image
from keras import regularizers

import numpy as np
import pandas as pd
import seaborn as sns
import tqdm

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from mpl_toolkits.axes_grid1 import ImageGrid

plt.rcParams['figure.figsize'] = [16, 10]
plt.rcParams['font.size'] = 16

sns.set_palette('muted',color_codes=True)
sns.set_context('notebook', font_scale=1.4)
vc_color = '#B5C9EB'

2024-04-10 08:35:50.331512: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-10 08:35:50.331642: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-10 08:35:50.488979: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Path variables
data_dir = "/kaggle/input/plantvillage-dataset/color"

# Get list of all classes
classes = os.listdir(data_dir)
set(classes)

{'Apple___Apple_scab',
 'Apple___Black_rot',
 'Apple___Cedar_apple_rust',
 'Apple___healthy',
 'Blueberry___healthy',
 'Cherry_(including_sour)___Powdery_mildew',
 'Cherry_(including_sour)___healthy',
 'Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot',
 'Corn_(maize)___Common_rust_',
 'Corn_(maize)___Northern_Leaf_Blight',
 'Corn_(maize)___healthy',
 'Grape___Black_rot',
 'Grape___Esca_(Black_Measles)',
 'Grape___Leaf_blight_(Isariopsis_Leaf_Spot)',
 'Grape___healthy',
 'Orange___Haunglongbing_(Citrus_greening)',
 'Peach___Bacterial_spot',
 'Peach___healthy',
 'Pepper,_bell___Bacterial_spot',
 'Pepper,_bell___healthy',
 'Potato___Early_blight',
 'Potato___Late_blight',
 'Potato___healthy',
 'Raspberry___healthy',
 'Soybean___healthy',
 'Squash___Powdery_mildew',
 'Strawberry___Leaf_scorch',
 'Strawberry___healthy',
 'Tomato___Bacterial_spot',
 'Tomato___Early_blight',
 'Tomato___Late_blight',
 'Tomato___Leaf_Mold',
 'Tomato___Septoria_leaf_spot',
 'Tomato___Spider_mites Two-spotted_

In [5]:
# List of species
species = set()
for c in classes:
    s = re.findall(".+?___", c)[0][:-3]
    species.add(s)
species

{'Apple',
 'Blueberry',
 'Cherry_(including_sour)',
 'Corn_(maize)',
 'Grape',
 'Orange',
 'Peach',
 'Pepper,_bell',
 'Potato',
 'Raspberry',
 'Soybean',
 'Squash',
 'Strawberry',
 'Tomato'}

In [6]:
# List of diseases
diseases = set()
for c in classes:
    d = re.findall("___.+", c)[0][3:]
    diseases.add(d)
diseases

{'Apple_scab',
 'Bacterial_spot',
 'Black_rot',
 'Cedar_apple_rust',
 'Cercospora_leaf_spot Gray_leaf_spot',
 'Common_rust_',
 'Early_blight',
 'Esca_(Black_Measles)',
 'Haunglongbing_(Citrus_greening)',
 'Late_blight',
 'Leaf_Mold',
 'Leaf_blight_(Isariopsis_Leaf_Spot)',
 'Leaf_scorch',
 'Northern_Leaf_Blight',
 'Powdery_mildew',
 'Septoria_leaf_spot',
 'Spider_mites Two-spotted_spider_mite',
 'Target_Spot',
 'Tomato_Yellow_Leaf_Curl_Virus',
 'Tomato_mosaic_virus',
 'healthy'}

In [7]:
# Look at specie, disease combinations
combined = []
for d in diseases:
    for s in species:
        name = s + "___" + d
        if name in classes:
            out = (d, s, 1)
        else:
            out = (d, s, 0)
        combined.append(out)
combined = pd.DataFrame(combined)
combined.columns = ['disease', 'specie', 'flag']
combined.pivot_table(values='flag', index='disease', columns='specie')

specie,Apple,Blueberry,Cherry_(including_sour),Corn_(maize),Grape,Orange,Peach,"Pepper,_bell",Potato,Raspberry,Soybean,Squash,Strawberry,Tomato
disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Apple_scab,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bacterial_spot,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
Black_rot,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cedar_apple_rust,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cercospora_leaf_spot Gray_leaf_spot,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Common_rust_,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Early_blight,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
Esca_(Black_Measles),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Haunglongbing_(Citrus_greening),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Late_blight,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [8]:
# Number of images in each class
image_counts = {}
for c in classes:
    path = data_dir +'/' + c
    count = len(os.listdir(path))
    image_counts[c] = count
p = pd.DataFrame.from_dict(image_counts, orient='index')
p

Unnamed: 0,0
Tomato___Late_blight,1909
Tomato___healthy,1591
Grape___healthy,423
Orange___Haunglongbing_(Citrus_greening),5507
Soybean___healthy,5090
Squash___Powdery_mildew,1835
Potato___healthy,152
Corn_(maize)___Northern_Leaf_Blight,985
Tomato___Early_blight,1000
Tomato___Septoria_leaf_spot,1771


In [9]:
p[1].sum()

KeyError: 1

In [None]:
# Look at specie, disease combinations
combined = []
for d in diseases:
    for s in species:
        name = s + "___" + d
        if name in classes:
            path = data_dir + '/' + name
            count = len(os.listdir(path))
            out = (d, s, count)
        else:
            out = (d, s, 0)
        combined.append(out)
combined = pd.DataFrame(combined)
combined.columns = ['disease', 'specie', 'flag']
hm = combined.pivot_table(values='flag', index='disease', columns='specie')

plt.figure(figsize=(12, 8))
msk = hm == 0
sns.heatmap(hm, cmap="YlGnBu", mask=msk);
plt.tight_layout()
plt.savefig("species_disease_heatmap.png")

In [None]:
print(f"Total number of images: {sum(image_counts.values())}")

In [None]:
# Get path and label for each training image
db=[]
for label, class_name in enumerate(classes):
    path = data_dir + '/' + class_name
    for file in os.listdir(path):
        if not '.ini' in file:
            db.append(['{}/{}'.format(class_name, file), label, class_name])
db = pd.DataFrame(db, columns=['file', 'label', 'class_name'])

In [None]:
def read_img(filepath, size):
    img = image.load_img(data_dir + '/' + filepath, target_size=size)
    img = image.img_to_array(img)
    return img

def format_name(s):
    return re.sub('_+', ' ', s)

In [None]:
# Plot some images (fixed dimensions)
num_classes = len(classes)
fig = plt.figure(1, figsize=(10, 40))
grid = ImageGrid(fig, 111, nrows_ncols=(num_classes, 10), axes_pad=0.05)

i = 0
for label, class_name in enumerate(classes):
    for filepath in db[db['class_name'] == class_name]['file'].values[:10]:
        ax = grid[i]
        img = read_img(filepath, (256, 256))
        ax.imshow(img / 255.)
        ax.axis('off')
        if i % 10 == 10 - 1:
            name = format_name(filepath.split('/')[0])
            ax.text(260, 112, name , verticalalignment='center')
        i += 1
        
plt.show();

In [None]:
# Plot image from each class
fig = plt.figure(1, figsize=(15, 10))
grid = ImageGrid(fig, 111, nrows_ncols=(6, 7), axes_pad=0.05)

for i in range(42):
    ax = grid[i]
    ax.axis('off')
    if i < len(classes):
        class_name = classes[i]
        for filepath in db[db['class_name'] == class_name]['file'].values[:1]:
            img = read_img(filepath, (224, 224))
            ax.imshow(img / 255.)
            ax.annotate(i+1, xy=(10,25), color="white", fontsize=12, fontweight='bold')
plt.tight_layout();
plt.savefig("/kaggle/working/image_per_specie.png")

In [None]:
data_dir = "/kaggle/input/plantvillage-dataset/"

fig = plt.figure(1, figsize=(6, 6))
grid = ImageGrid(fig, 111, nrows_ncols=(3, 3), axes_pad=0.05)

np.random.seed(33)
rnd = np.random.randint(0, len(db), 3)
image_types = ["color", "grayscale", "segmented"]
i = 0
for r in rnd:
    file = db.file[r]
    for t in image_types:
        filepath = data_dir + t + '/' + file
        if t == "segmented":
            filepath = filepath.replace(".JPG", "") + "_final_masked.jpg"
            
        ax = grid[i]
        img = image.load_img(filepath, target_size=(256, 256))
        img = image.img_to_array(img)
        ax.imshow(img / 255.)
        ax.axis('off')
        i += 1
plt.tight_layout()
plt.savefig("image_types.png")