# Experiment 1

In [17]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Imports

In [18]:
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler

from plant_village_dataset import PlantVillageDataset

## Prepare Data

Here we define the split according to the project specifications: 80% of the dataset is treated as unlabeled, 10% is used for training, and 10% for testing. We interpret the 10% training share as covering both the training and the validation data (it is further divided 80/20 into training and validation), while the 10% testing share is used wholly for testing.

In [19]:
def split(dataset, batch_size=128, random_state=None):
    """Splits the data into unlabeled, training, validation, and testing.

    Three successive stratified splits are applied to the dataset indices:

    1. 80% unlabeled / 20% labeled,
    2. the labeled part 50/50 into train+validation / test,
    3. train+validation 80/20 into train / validation.

    Relative to the whole dataset this is 80% unlabeled, 8% training,
    2% validation and 10% testing.

    Args:
        dataset (Dataset): Dataset to produce splits from. Iterating over it
            must yield ``(sample, label)`` pairs.
        batch_size (int, optional): Batch size for the data loaders. Defaults to 128.
        random_state (int, optional): Seed forwarded to every
            ``train_test_split`` call so that the membership of each split is
            reproducible. It does not control the order in which the
            samplers draw indices. Defaults to None (non-deterministic).

    Returns:
        tuple: Tuple containing unlabeled, training, validation, and test data loaders.
    """
    # NOTE(review): this iterates the entire dataset just to collect labels;
    # presumably that loads every sample - confirm whether the dataset
    # exposes its labels directly.
    labels = np.array([label for _, label in dataset])
    all_indices = np.arange(len(dataset))

    unlabeled_indices, labeled_indices = train_test_split(
        all_indices,
        test_size=0.2,
        stratify=labels,
        random_state=random_state)
    # `stratify` must have one entry per element being split, so the labels
    # are indexed by the subset of dataset indices passed in; handing over
    # the full-length label array here raises an inconsistent-length error.
    train_val_indices, test_indices = train_test_split(
        labeled_indices,
        test_size=0.5,
        stratify=labels[labeled_indices],
        random_state=random_state)
    train_indices, val_indices = train_test_split(
        train_val_indices,
        test_size=0.2,
        stratify=labels[train_val_indices],
        random_state=random_state)

    def make_loader(indices):
        # Every loader wraps the same dataset; the sampler restricts it to
        # the given indices and draws them in random order.
        return DataLoader(dataset,
                          batch_size=batch_size,
                          sampler=SubsetRandomSampler(indices))

    unlabeled_loader = make_loader(unlabeled_indices)
    train_loader = make_loader(train_indices)
    val_loader = make_loader(val_indices)
    test_loader = make_loader(test_indices)

    return unlabeled_loader, train_loader, val_loader, test_loader

In [20]:
# Location handed to PlantVillageDataset (presumably the image directory,
# relative to the notebook's working directory - confirm).
DATASET_ROOT = 'images'
dataset = PlantVillageDataset(DATASET_ROOT)

Loading Plant Village
 - Normalizing dataset


 - Calculating mean and standard deviation: 100%|██████████| 434/434 [01:00<00:00,  7.18it/s]

 - Normalized dataset:
  - Mean: [0.4673, 0.4897, 0.4125]
  - Standard deviation: [0.1670, 0.1398, 0.1845]





## Run 1

### Split

In [None]:
# Build the four data loaders for this run with split()'s default batch size.
(unlabeled_loader,
 train_loader,
 val_loader,
 test_loader) = split(dataset)

### Train

In [21]:
# Declare ResNet50
# Train ResNet50

# Declare UNetAutoEncoder
# Train UNetAutoEncoder

# Extract Encoder from UNetAutoEncoder
# Declare 2 Encoder + ResNet50 Frankensteins

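# Train one Frankenstein with the Encoder frozen: disable gradients on its
# parameters (requires_grad_(False)) or leave them out of the optimizer.
# Note that eval() alone only switches layers such as dropout/batch norm to
# inference behaviour; it does not stop the weights from being updated.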

# Train the second Frankenstein normally

### Test

## Run 2

### Split

### Train

### Test

## Results