After concluding Phase 1, we continue with Phase 2: Training

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from typing import Tuple
from keras.utils import load_img
import keras.utils as image
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from data_loader import Dataloader
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import random
from sklearn.utils import shuffle

# Seed every random number generator this notebook relies on so that runs are
# reproducible.
tf.random.set_seed(222)

random.seed(222)

# scikit-learn helpers such as train_test_split and shuffle fall back to
# NumPy's global RNG when no random_state is passed, so it must be seeded too.
np.random.seed(222)

# 2. Training

First, we split our data into training, validation and test set.
Following the paper with the highest number of upvotes on paperswithcode [4], we split our data into 90% training, 5% validation and 5% test set [5]. As we found out before, we need to be aware of the class imbalance in the dataset. Therefore, in line with [5], we apply the split to each class separately, i.e. for each class, 90% of the samples are used for training, 5% for validation and 5% for testing. <br>
Since ~50k images are a lot to work with, we also create a smaller train/validation/test split with 100 samples per class (90 for training, 5 for validation and 5 for testing).

In [2]:
import pandas as pd

data = pd.read_csv("data.csv")
# One class label per sample: the plant name joined with the disease name.
data = data.assign(label=data["plant"] + data["disease"])
display(data["label"].nunique())

# Fixed seed so that the split (and everything trained on it) is reproducible.
SPLIT_SEED = 222

# The per-class pieces are collected in lists and concatenated once after the
# loop; concatenating inside the loop copies the growing frame on every pass.
train_parts, validation_parts, test_parts = [], [], []
train_small_parts, validation_small_parts, test_small_parts = [], [], []

for label in data["label"].unique():
    data_for_label = data[data["label"] == label].reset_index(drop=True)

    # small: the first 100 samples of the class, split 90 / 5 / 5
    train_small_parts.append(data_for_label[:90])
    validation_small_parts.append(data_for_label[90:95])
    test_small_parts.append(data_for_label[95:100])

    # all data: 90% training, the remaining 10% halved into test and validation
    train_current_label, rest = train_test_split(
        data_for_label, test_size=0.1, random_state=SPLIT_SEED
    )
    test_current_label, validation_current_label = train_test_split(
        rest, test_size=0.5, random_state=SPLIT_SEED
    )

    train_parts.append(train_current_label)
    validation_parts.append(validation_current_label)
    test_parts.append(test_current_label)

train = pd.concat(train_parts)
validation = pd.concat(validation_parts)
test = pd.concat(test_parts)
train_small = pd.concat(train_small_parts)
validation_small = pd.concat(validation_small_parts)
test_small = pd.concat(test_small_parts)

# Holds only if every class contributes exactly 100 samples to the small split.
print(
    "Sanity check for small: ",
    len(train_small) + len(test_small) + len(validation_small)
    == data["label"].nunique() * 100,
)

# The frames were assembled class by class, so mix the classes before use.
train = shuffle(train, random_state=SPLIT_SEED)
test = shuffle(test, random_state=SPLIT_SEED)
validation = shuffle(validation, random_state=SPLIT_SEED)
train_small = shuffle(train_small, random_state=SPLIT_SEED)
test_small = shuffle(test_small, random_state=SPLIT_SEED)
validation_small = shuffle(validation_small, random_state=SPLIT_SEED)
print("Number of samples: ", len(data))
print("Number of training samples: ", len(train))
print("Number of test samples: ", len(test))
print("Number of validation: ", len(validation))
print("Sanity check: ", len(train) + len(test) + len(validation) == len(data))

38

Sanity check for small:  True
Number of samples:  54281
Number of training samples:  48836
Number of test samples:  2716
Number of validation:  2729
Sanity check:  True


<details>
    <summary>
    GPT
    </summary>
    how do I rewrite this code to prevent from the SettingWithCopyWarning:
    Code: """
    data["label"] = data["plant"] + data["disease"]
    """
</details>

As a sanity check, we train a random forest classifier and see whether it performs better than random guessing. Since we have 38 different classes, random guessing should achieve an accuracy of roughly 1/38 (~3%) on our balanced small datasets.

In [3]:
def load_and_label_data(
    data: pd.DataFrame, target_size: Tuple[int, int] = (256, 256)
) -> Tuple[list, list]:
    """Load the images listed in *data* and pair them with their labels.

    Args:
        data: Frame with a ``"path"`` column (image file to load) and a
            ``"label"`` column (class of that image).
        target_size: ``(height, width)`` every image is resized to while
            loading.

    Returns:
        A tuple ``(X, y)``: ``X`` is the list of image arrays with every value
        divided by 255, ``y`` the list of labels in the same order.
    """
    X, y = [], []
    # Walk the two needed columns directly instead of building a Series per
    # row with ``iterrows``.
    for path, label in zip(data["path"], data["label"]):
        # based on the code from the exercise
        img = load_img(path, target_size=target_size)
        img = image.img_to_array(img)
        X.append(img / 255)
        y.append(label)
    return X, y


In [4]:
# Only the training and validation images are needed for this sanity check;
# the test images are not loaded because nothing in this cell evaluates on them.
X_train, y_train = load_and_label_data(train_small)
X_validation, y_validation = load_and_label_data(validation_small)

# as discussed in the exercise, we need to flatten the data for "regular machine learning"
# (one row per image, one column per pixel value)
X_train_flat = np.asarray(X_train).reshape(len(X_train), -1)
X_val_flat = np.asarray(X_validation).reshape(len(X_validation), -1)

# n_jobs=-1 builds the trees on all available cores.
model = RandomForestClassifier(random_state=42, n_jobs=-1)

model.fit(X_train_flat, y_train)
predictions = model.predict(X_val_flat)

print(accuracy_score(y_validation, predictions))

# release memory
del X_train_flat, X_val_flat, X_train, y_train

KeyboardInterrupt: 

An accuracy of ~0.52 appears reasonable, since it is far above the ~3% expected from random guessing. Next, we try to train a convolutional neural network. The following code is mostly based on what we learned during the exercise. The goal is to get a first impression of how well convolutional neural networks perform on the given dataset. To achieve this, we first need to one-hot encode the labels. Then, we can define a model.

In [26]:
def _as_column(labels):
    """Return *labels* as a NumPy array with a single column."""
    return np.array(labels).reshape(-1, 1)


BATCH_SIZE = 64

encoder = OneHotEncoder(sparse_output=False)

# The encoder is fitted on the training labels only and then reused for the
# validation and test labels.
y_train_encoded = encoder.fit_transform(_as_column(train_small["label"]))
y_validation_encoded = encoder.transform(_as_column(validation_small["label"]))
y_test_encoded = encoder.transform(_as_column(test_small["label"]))

# One loader per split, each pairing the image paths with the encoded labels.
training_set = Dataloader(train_small["path"], y_train_encoded, BATCH_SIZE)
test_set = Dataloader(test_small["path"], y_test_encoded, BATCH_SIZE)
validation_set = Dataloader(
    validation_small["path"], y_validation_encoded, BATCH_SIZE
)


<details>
    <summary>
    GPT
    </summary>
    how many neurons should a dense layer in a convolutional neural network have?
</details>

In [24]:
model = tf.keras.models.Sequential()

# Feature extractor: three conv + max-pool stages with 8, 16 and 32 filters.
# Only the first convolution declares the input shape.
for stage, filters in enumerate((8, 16, 32)):
    conv_options = {"activation": "relu", "padding": "same"}
    if stage == 0:
        conv_options["input_shape"] = (256, 256, 3)
    model.add(tf.keras.layers.Conv2D(filters, (3, 3), **conv_options))
    model.add(tf.keras.layers.MaxPooling2D(2, 2))

model.add(tf.keras.layers.Flatten())

# Classifier head: dense layers of shrinking width, each followed by a
# dropout layer with a shrinking rate.
for units, dropout_rate in ((512, 0.3), (256, 0.2), (128, 0.1)):
    model.add(tf.keras.layers.Dense(units, activation="relu"))
    model.add(tf.keras.layers.Dropout(dropout_rate))

model.add(tf.keras.layers.Dense(64, activation="relu"))
# One softmax output per class (38 classes).
model.add(tf.keras.layers.Dense(38, activation="softmax"))

model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_31 (Conv2D)          (None, 256, 256, 8)       224       
                                                                 
 max_pooling2d_31 (MaxPoolin  (None, 128, 128, 8)      0         
 g2D)                                                            
                                                                 
 conv2d_32 (Conv2D)          (None, 128, 128, 16)      1168      
                                                                 
 max_pooling2d_32 (MaxPoolin  (None, 64, 64, 16)       0         
 g2D)                                                            
                                                                 
 conv2d_33 (Conv2D)          (None, 64, 64, 32)        4640      
                                                                 
 max_pooling2d_33 (MaxPoolin  (None, 32, 32, 32)      

In [27]:
# Configure training: Adam optimizer, categorical cross-entropy loss and
# accuracy as the reported metric.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [28]:
# Stop training once the validation loss has not improved for 3 epochs in a
# row and restore the weights of the best epoch seen.
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=3, restore_best_weights=True
)

# No ``batch_size`` argument here: the data is passed as loader objects that
# were created with their own batch size, and Keras documents that
# ``batch_size`` must not be specified for inputs that already produce batches.
# NOTE(review): assumes Dataloader yields ready-made batches — confirm in
# data_loader.py.
history = model.fit(
    training_set,
    epochs=20,
    verbose=1,
    validation_data=validation_set,
    callbacks=[callback],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
