In [None]:
import copy

import numpy as np
import matplotlib.pyplot as plt

# Torch imports
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import DataLoader
from torch.utils.data import random_split

from torchvision import datasets, transforms, models

# For saving images
import pandas as pd
import requests
import shutil
import os
from PIL import Image

# Metrics
from tqdm.auto import tqdm
import time

In [None]:
# Get all the images from their urls, resize them, and save them locally.
# One CSV per plant lives in `folder`; each CSV's `image_url` column is downloaded
# into `new_folder/<plant>/<row index>.png` after being resized to rescale x rescale.
folder = "all-plant-data"
new_folder = "plant-images-64"
temp = os.path.join(new_folder, "temp")  # scratch file reused for every download
rescale = 64
start = False  # not read anywhere in this cell
max_attempts = 5
request_timeout = 30  # seconds; without a timeout a stalled server blocks the loop forever
files = os.listdir(folder)

# The scratch file and the per-plant folders live under new_folder, so it must exist first
os.makedirs(new_folder, exist_ok=True)

for file in tqdm(files, desc='Plants Completed: '):
    df = pd.read_csv(os.path.join(folder, file))
    plant = os.path.splitext(str(file))[0]
    os.makedirs(os.path.join(new_folder, plant), exist_ok=True)

    for idx, url in enumerate(tqdm(df['image_url'], desc=f'{plant}: ')):
        filepath = os.path.join(new_folder, plant, str(idx))

        for attempts in range(max_attempts):
            try:
                with requests.get(url, stream=True, timeout=request_timeout) as r:
                    # Turn HTTP error pages into exceptions instead of saving them as "images"
                    r.raise_for_status()
                    # Ask urllib3 to undo gzip/deflate transfer encoding on the raw stream
                    r.raw.decode_content = True

                    with open(temp, 'wb') as f_out:
                        shutil.copyfileobj(r.raw, f_out)

                with Image.open(temp) as image:
                    image.resize((rescale, rescale)).save(str(filepath) + ".png")

                if attempts > 0:
                    print(f"Succeeded after {attempts + 1} tries")
                break

            # RequestException covers connection errors, timeouts and bad HTTP statuses;
            # OSError covers unreadable/corrupt image files reported by PIL.
            except (requests.exceptions.RequestException, OSError) as err:
                print(f"{type(err).__name__}: Could not get image {idx} of file {file}")
                if attempts < max_attempts - 1: print("Attempting to retry...")
                else: print(f"Failed {max_attempts} times, moving on to next image")

# Time Taken to download all 69,850 images: 6:51:48

In [None]:
# Plant names that count as native: one data file per plant, extension stripped
is_native = {os.path.splitext(str(file))[0] for file in os.listdir('native-plant-data')}

# torchvision's ImageFolder numbers classes by the *sorted* names of the sub-directories,
# whereas os.listdir returns entries in arbitrary order. Mirror ImageFolder's ordering so
# the indices collected here line up with the labels the data loaders produce.
class_names = sorted(entry.name for entry in os.scandir('plant-images') if entry.is_dir())

# Class indices (as used in the labels) whose plant is native
subset_native = {idx for idx, name in enumerate(class_names) if name in is_native}

print(subset_native)

In [None]:
# Transfer Learning with ResNet50
class ResNetClassifier(nn.Module):
    """ResNet50 with a replaced final layer, plus its own training and evaluation loops.

    Args:
        lr: learning rate handed to the AdamW optimizer.
        tol: number of validation epochs without sufficient loss improvement
            tolerated before training stops early.
        batch_size: batch size used by all three data loaders.
        epochs: maximum number of training epochs.
        num_classes: number of outputs of the replacement fully connected layer.

    After `train_` / `test_` the history lists and `test_acc`, `test_loss`, `sub_acc`
    hold the recorded metrics.
    """

    def __init__(self, lr, tol, batch_size, epochs, num_classes):
        super(ResNetClassifier, self).__init__()

        # Initialization
        self.batch_size = batch_size
        self.epochs = epochs
        self.num_classes = num_classes
        self.lr = lr
        self.tol = tol
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        # For plotting
        self.loss_history = []       # summed training loss, one entry per epoch
        self.acc_history = []        # training accuracy, one entry per batch
        self.epoch_acc_history = []  # training accuracy, one entry per epoch
        self.val_loss_history = []   # summed validation loss, one entry per epoch
        self.val_acc_history = []    # validation accuracy, one entry per epoch

        # -1 marks "test_ has not been run yet"
        self.test_acc = -1
        self.test_loss = -1
        self.sub_acc = -1

        # Model: pretrained weights are fetched by torchvision when `weights` is given;
        # the builder has no `download` argument.
        self.model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

        # Swap the final layer so the network predicts `num_classes` outputs
        self.in_dims = self.model.fc.in_features
        self.model.fc = nn.Linear(in_features=self.in_dims, out_features=self.num_classes)

        # Optimizer and loss function
        self.loss = nn.CrossEntropyLoss()
        self.optimizer = optim.AdamW(self.parameters(), lr = self.lr)

        # Move model to the GPU and get data
        self.to(self.device)

        self.train_data_loader = None
        self.val_data_loader = None
        self.test_data_loader = None
        self.get_data()


    # Forward pass for the model
    def forward(self, batch_data):
        """Move `batch_data` to the model's device and return the network output."""
        batch_data = batch_data.detach().to(self.device)
        return self.model(batch_data)


    # Get data to train the model
    def get_data(self):
        """Build train/validation/test loaders from the 'plant-images' folder (70/15/15 split)."""
        # Data augmentation
        # NOTE(review): this transform is attached to the dataset before the split, so the
        # random flip is also applied to validation and test images — confirm that is intended.
        tf = transforms.Compose([transforms.RandomHorizontalFlip(), transforms.ToTensor()])

        invasive_data = datasets.ImageFolder('plant-images', transform=tf)

        # Fixed generator seed keeps the split identical between runs
        train_data, val_data, test_data = random_split(invasive_data, [0.7, 0.15, 0.15], generator=torch.Generator().manual_seed(61))

        self.train_data_loader = DataLoader(train_data, batch_size=self.batch_size, shuffle=True, num_workers=4)
        self.val_data_loader = DataLoader(val_data, batch_size=self.batch_size, shuffle=True, num_workers=4)
        self.test_data_loader = DataLoader(test_data, batch_size=self.batch_size, shuffle=True, num_workers=4)


    @staticmethod
    def _format_elapsed(elapsed):
        """Format a duration in seconds as hours:minutes:seconds."""
        hours = int(elapsed // 3600)
        minutes = int((elapsed % 3600) // 60)
        seconds = int(elapsed % 60)
        return f"{hours}:{minutes}:{seconds}"


    # Train the model
    def train_(self):
        """Train with early stopping on validation loss and restore the best weights.

        Each epoch runs a training pass followed by a validation pass. The validation
        loss counts as improved when it drops by more than 3% of the best loss and by
        more than 4, or by at least 50. After `self.tol` epochs without such an
        improvement training stops.

        Returns:
            This instance, with the best-scoring weights loaded.
        """
        stop = False
        time_started = time.time()

        best_model = copy.deepcopy(self.state_dict())
        best_loss = 10000000.0
        best_acc = 0
        past = 0

        for i in range(self.epochs):
            header = f"Epoch {i+1}/{self.epochs}"
            print(header)
            print("-" * len(header))

            for phase in ['train', 'validate']:
                training = phase == 'train'
                if training:
                    self.train()
                    loader = self.train_data_loader
                else:
                    self.eval()
                    loader = self.val_data_loader

                # For plotting and statistics
                epoch_loss = 0
                epoch_correct = 0
                epoch_samples = 0

                # Main training loop
                for in_data, label in loader:
                    label = label.to(self.device)

                    if training:
                        self.optimizer.zero_grad()

                    # Only build the autograd graph when the weights will be updated
                    with torch.set_grad_enabled(training):
                        prediction = self.forward(in_data)
                        batch_loss = self.loss(prediction, label)

                    # Count hits against the real batch length: the final batch of a
                    # loader can be shorter than self.batch_size.
                    classes = torch.argmax(prediction, dim=1)
                    batch_correct = (classes == label).sum().item()
                    batch_len = label.size(0)

                    epoch_loss += batch_loss.item()
                    epoch_correct += batch_correct
                    epoch_samples += batch_len

                    # Train through backpropagation
                    if training:
                        batch_loss.backward()
                        self.optimizer.step()

                        # Statistics
                        self.acc_history.append(batch_correct / batch_len)

                epoch_acc = epoch_correct / epoch_samples if epoch_samples else 0.0

                elapsed = time.time() - time_started
                print(f"Finished {phase} with loss {epoch_loss} and accuracy of {epoch_acc} in time "
                      f"{self._format_elapsed(elapsed)}")

                if training:
                    self.loss_history.append(epoch_loss)
                    self.epoch_acc_history.append(epoch_acc)
                else:
                    self.val_loss_history.append(epoch_loss)
                    self.val_acc_history.append(epoch_acc)
                    best_acc = max(best_acc, epoch_acc)

                    if (best_loss - epoch_loss > 0.03 * best_loss and best_loss - epoch_loss > 4) or (best_loss - epoch_loss >= 50):
                        past = 0
                        best_loss = epoch_loss
                        best_model = copy.deepcopy(self.state_dict())
                    else:
                        past += 1
                        print("No improvement over last epoch.")
                        if past >= self.tol:
                            print("No improvement in model loss. Early stopping...")
                            stop = True
                            break

            print()
            if stop: break

        print("Training complete...")
        print(f"Best validation loss: {best_loss}")
        print(f"Best validation accuracy: {best_acc}")

        self.load_state_dict(best_model)
        return self


    def test_(self, native_classes=None):
        """Evaluate on the test loader and store `test_acc`, `test_loss` and `sub_acc`.

        Args:
            native_classes: collection of class indices treated as "native". A prediction
                counts as correct for the subset accuracy when it falls on the same side
                (in / not in this collection) as the true label. Defaults to the
                notebook-level `subset_native` set.
        """
        if native_classes is None:
            native_classes = subset_native  # built in an earlier notebook cell

        self.eval()

        time_started = time.time()
        total_loss = 0
        correct = 0
        sub_correct = 0
        samples = 0

        # No weights are updated here, so skip gradient tracking entirely
        with torch.no_grad():
            for in_data, label in self.test_data_loader:

                # Compare correct answers to model
                label = label.to(self.device)
                prediction = self.forward(in_data)
                classes = torch.argmax(prediction, dim=1)

                correct += (classes == label).sum().item()

                for predicted, actual in zip(classes.tolist(), label.tolist()):
                    if (predicted in native_classes) == (actual in native_classes): sub_correct += 1

                total_loss += self.loss(prediction, label).item()
                samples += label.size(0)

        # Divide by the number of samples actually seen rather than batches * batch_size
        self.test_acc = correct / samples if samples else 0.0
        self.test_loss = total_loss
        self.sub_acc = sub_correct / samples if samples else 0.0
        elapsed = time.time() - time_started
        print(f"Finished with loss {total_loss} and accuracy of {self.test_acc} in {int(elapsed // 60)}:{int(elapsed % 60)}")
        print(f"Testing Subset Accuracy (Invasive/Native): {self.sub_acc}")

In [None]:
# Build the classifier, fit it, then score it on the test split.
hyperparameters = dict(lr=0.001, tol=2, batch_size=32, epochs=10, num_classes=23)

model = ResNetClassifier(**hyperparameters)
model = model.train_()  # rebind to whatever train_ returns
model.test_()

In [None]:
# Persist the model's state dict (its parameters and buffers) rather than the module object
checkpoint_path = 'model-v4'
torch.save(model.state_dict(), checkpoint_path)

In [None]:
v = input("Enter version #: ")
fig = int(input("Enter figure #: "))
epoch_dims = np.arange(1, len(model.loss_history) + 1, 1)
path = 'graphs-analysis/'

# savefig does not create missing directories
os.makedirs(path, exist_ok=True)


def plot_history(history, figure_number, quantity, ylabel, file_suffix, version, out_dir):
    """Plot one per-epoch metric series, save it as a PNG and show it.

    Args:
        history: sequence of per-epoch values to plot.
        figure_number: number printed in the "Figure N" part of the title.
        quantity: name of the plotted quantity as it appears in the title.
        ylabel: label of the y axis.
        file_suffix: file name part appended after 'v<version>'.
        version: version tag used in the title and the file name.
        out_dir: directory prefix (with trailing separator) for the saved PNG.
    """
    figure, ax = plt.subplots()
    ax.plot(history, marker='o', color='darkolivegreen')
    ax.set_title(f"Figure {figure_number}: {quantity} vs. Number of Epochs\nfor Version {version} of the Neural Network")
    ax.set_xlabel("Number of Epochs")
    ax.set_ylabel(ylabel)
    # Points sit at 0..n-1 but are labelled as epochs 1..n, sized from this series itself
    ax.set_xticks(np.arange(len(history)))
    ax.set_xticklabels(np.arange(1, len(history) + 1, 1))
    ax.grid(axis='y')
    figure.savefig(out_dir + 'v' + version + file_suffix, bbox_inches='tight')
    plt.show()
    plt.close(figure)  # release the figure once it has been saved and shown


# (series, title quantity, y-axis label, output file suffix) for Graphs 1-4, in order
graphs = [
    (model.loss_history, "Cross-Entropy Training Loss", "Cross-Entropy Training Loss", "-train-loss-graph.png"),
    (model.val_loss_history, "Cross-Entropy Validation Loss", "Cross-Entropy Validation Loss", "-val-loss-graph.png"),
    (model.epoch_acc_history, "Training Accuracy", "Accuracy (Fraction)", "-epoch-acc-graph.png"),
    (model.val_acc_history, "Validation Accuracy", "Accuracy (Fraction)", "-val-acc-graph.png"),
]

for offset, (history, quantity, ylabel, file_suffix) in enumerate(graphs):
    plot_history(history, fig + offset, quantity, ylabel, file_suffix, v, path)

In [None]:
def write_to(fw, arr, header):
    """Write `header` on one line, then `arr` as a bracketed, comma-separated list.

    Args:
        fw: writable text file object.
        arr: sequence of values; each is converted with `str`. May be empty,
            in which case `[]` is written.
        header: title line written above the list.
    """
    # join handles the empty case, where indexing arr[0] would raise IndexError
    fw.write(header + '\n[' + ', '.join(str(num) for num in arr) + ']\n\n')


# Reuse the version tag if an earlier cell already defined it; otherwise ask for it
try:
    v
except NameError:
    v = input("Enter version: ")

# open(..., 'w') does not create missing directories, and this cell can run
# without the plotting cell having been executed first.
os.makedirs('graphs-analysis', exist_ok=True)

with open(f'graphs-analysis/v{v}-data.txt', 'w') as f_out:
    # Per-epoch series, written in this order
    per_epoch_sections = [
        (model.loss_history, "Training Loss"),
        (model.val_loss_history, "Validation Loss"),
        (model.epoch_acc_history, "Training Accuracy (Per Epoch)"),
        (model.val_acc_history, "Validation Accuracy"),
    ]
    for history, title in per_epoch_sections:
        write_to(f_out, history, title)

    f_out.write(f"Testing Loss, Accuracy\n{model.test_loss}, {model.test_acc}\n\n")

    # The per-batch series is the longest, so it goes last
    write_to(f_out, model.acc_history, "Training Accuracy (Per Batch)")